// Code/Phonemizer.cs
#nullable enable
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using static SharpTalk.AudioProcessor;
using static SharpTalk.Phonemizer.Normalizer;

namespace SharpTalk
{

    public class Phonemizer
    {
        // Reader over the dictData byte array passed to the constructor.
        readonly DictReader _dict;
        // Reader over the symbolsData byte array passed to the constructor.
        readonly DictReader _symbols;

        // Statistics counters, zeroed by ResetStats(). Presumably incremented by the
        // word-lookup code as each word is resolved via dictionary, morphology, or
        // letter-to-sound rules respectively — TODO confirm against the lookup code.
        public int StatDict  { get; private set; }
        public int StatMorph { get; private set; }
        public int StatLts   { get; private set; }
        // Resets all three statistics counters to zero.
        public void ResetStats() { StatDict = StatMorph = StatLts = 0; }
        // Read-only access to the reader stored in _dict.
        public DictReader Dict => _dict;

        // Opcodes that are control codes, not actual phonemes (56-72)
        // The byte values 56 (0x38) and 57 (0x39) that appear literally in the
        // LetterPhonemes and NormWords tables equal OP_STRESS1 and OP_STRESS2.
        const byte OP_STRESS1 = 56;  // _Stress1_  -> kPrimaryStress
        const byte OP_STRESS2 = 57;  // _Stress2_  -> kSecondaryStress
        const byte OP_EMPHSTRESS = 58;  // _EmphStress_ -> kEmphaticStress
        const byte OP_SYLL = 63;  // _Syll_     -> kSyllable_Start
        const byte OP_WORD = 64;  // _Word_     -> kWord_Start
        const byte OP_PREP = 65;  // _Prep_     -> kPrep_Start
        const byte OP_VERB = 66;  // _Verb_     -> kVerb_Start
        const byte OP_COMMA = 67;  // _Comma_
        const byte OP_PERIOD = 68;  // _Period_
        const byte OP_QUEST = 69;  // _Quest_
        const byte OP_EXCLAM = 70;  // _Exclam_

        // Function words: words that are NOT flagged as content words when their
        // tokens are appended (callers pass isContent = !FuncWords.Contains(word)),
        // so they don't drive pitch peaks in the BackEnd pitch algorithm.
        // Mirrors a POS-based content/function distinction.
        // Lookup is case-insensitive; each word is listed once, alphabetically
        // within its group ("than" is listed under prepositions only).
        static readonly HashSet<string> FuncWords = new(StringComparer.OrdinalIgnoreCase)
        {
            // articles / determiners
            "a", "an", "the",

            // prepositions
            "about", "as", "at", "by", "for", "from", "in", "into", "of",
            "off", "on", "out", "over", "than", "to", "under", "up", "with",

            // coordinating conjunctions
            "and", "but", "nor", "or", "so", "yet",

            // subordinating conjunctions
            "after", "although", "because", "before", "if", "since",
            "that", "though", "unless", "until", "when", "while",

            // auxiliaries & copula
            "am", "are", "be", "been", "being", "is", "was", "were",
            "did", "do", "does", "had", "has", "have",
            "can", "could", "may", "might", "must", "ought",
            "shall", "should", "will", "would",

            // subject / object pronouns
            "he", "i", "it", "she", "they", "we", "you",
            "her", "him", "me", "them", "us",

            // possessive determiners
            "his", "its", "my", "our", "their", "your",

            // other function words
            "here", "no", "not", "there",
        };

        // Tokenizer for normalized text. Each match sets exactly one capture group:
        //   1: run of digits
        //   2: word — letters, optionally with internal apostrophes ("don't")
        //   3: clause punctuation   , ; :
        //   4: sentence punctuation ... . ! ? ~
        //   5: whitespace
        // Characters that fit none of the alternatives produce no match and are skipped.
        static readonly Regex TokenRe = new(
            @"(\d+)|([a-zA-Z]+(?:'[a-zA-Z]+)*)|([,;:])|(\.\.\.|[.!?~])|(\s+)",
            RegexOptions.Compiled);

        // Zero-width split points for CamelCase/PascalCase (ASCII letters only):
        //   - between a lowercase and an uppercase letter:  "SharpTalk" -> Sharp|Talk
        //   - before the last capital of an uppercase run that is followed by a
        //     lowercase letter:                              "XMLParser" -> XML|Parser
        // Normalizer.Normalize replaces each split point with a space.
        static readonly Regex CamelSplit = new(
            @"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])",
            RegexOptions.Compiled);

#if !SANDBOX
        // Convenience constructor: uses the lexicon and symbol tables bundled in LibraryData.
        // Compiled out when SANDBOX is defined.
        public Phonemizer() : this(LibraryData.EnglishLex, LibraryData.Symbols) { }
#endif

        // Wraps the two raw tables in DictReader instances:
        // dictData backs _dict, symbolsData backs _symbols.
        public Phonemizer(byte[] dictData, byte[] symbolsData)
            => (_dict, _symbols) = (new DictReader(dictData), new DictReader(symbolsData));

        // Code of the most recent punctuation mark seen by the last text-to-phoneme call.
        // Starts as _Period_; TextSegmentToPhonemes and TextToPhonemes reset it to _Period_
        // on entry and overwrite it for every clause/sentence punctuation token they meet.
        public short LastEndPunct { get; private set; } = _Period_;

        // Splits text into clauses and returns, per clause, its phoneme tokens together
        // with the punctuation code that ended it.
        //   - Embedded-command segments are skipped (TtsEngine handles them).
        //   - Each non-empty singing block becomes one clause with EndPunct 0.
        //   - Plain text is normalized, then cut after every clause (,;:) or sentence
        //     (... . ! ? ~) punctuation token; any non-blank remainder is a final clause.
        // If nothing at all was produced, the whole input is run through TextToPhonemes
        // and returned as a single clause.
        public (PhonemeToken[] Tokens, short EndPunct)[] TextToSentenceTokens(string text)
        {
            var clauses = new List<(PhonemeToken[], short)>();

            foreach (var segment in EmbeddedCmd.ParseSegments(text))
            {
                // Commands belong to TtsEngine, not to the front end.
                if (segment.IsCommand)
                    continue;

                if (segment.IsSinging)
                {
                    // A singing block is always its own clause — never mixed with speech.
                    if (segment.Singing!.Count > 0)
                        clauses.Add((segment.Singing.ToArray(), 0));
                    continue;
                }

                // One clause per boundary so each gets its own BackEnd.Process call
                // and pitch resets cleanly.
                string normalized = Normalize(segment.PlainText!);
                int clauseStart = 0;

                foreach (Match match in TokenRe.Matches(normalized))
                {
                    bool isBoundary = match.Groups[3].Success || match.Groups[4].Success;
                    if (!isBoundary)
                        continue;

                    int clauseEnd = match.Index + match.Length;
                    string clauseText = normalized.Substring(clauseStart, clauseEnd - clauseStart);

                    // Convert first: the call updates LastEndPunct, which is read afterwards.
                    PhonemeToken[] clauseTokens = TextSegmentToPhonemes(clauseText);
                    clauses.Add((clauseTokens, LastEndPunct));

                    clauseStart = clauseEnd;
                }

                // Trailing text with no closing punctuation.
                string tail = normalized.Substring(clauseStart);
                if (!string.IsNullOrWhiteSpace(tail))
                {
                    PhonemeToken[] tailTokens = TextSegmentToPhonemes(tail);
                    clauses.Add((tailTokens, LastEndPunct));
                }
            }

            if (clauses.Count == 0)
            {
                PhonemeToken[] fallback = TextToPhonemes(text);
                clauses.Add((fallback, LastEndPunct));
            }

            return clauses.ToArray();
        }

        // Process a pure-text span (no embedded commands) into phoneme tokens.
        //
        // Side effect: LastEndPunct is reset to _Period_ on entry and then overwritten
        // by every punctuation token, so afterwards it holds the code of the last
        // punctuation mark in the span (or _Period_ if there was none).
        //
        // The span is run through Normalize here even when the caller has already
        // normalized it (TextToSentenceTokens does).
        private PhonemeToken[] TextSegmentToPhonemes(string text)
        {
            var tokens = new List<PhonemeToken>();
            LastEndPunct = _Period_;

            var matches = TokenRe.Matches(Normalize(text));

            // Pre-extract the upper-cased word list for heteronym context resolution;
            // wordIdx tracks the current word's position in that list.
            var ctxWords = new List<string>();
            foreach (Match wm in matches)
                if (wm.Groups[2].Success) ctxWords.Add(wm.Groups[2].Value.ToUpperInvariant());
            int wordIdx = 0;

            foreach (Match m in matches)
            {
                if (m.Groups[1].Success)           // number
                {
                    // Digit runs too long for a long are silently dropped.
                    if (long.TryParse(m.Groups[1].Value, out long n))
                        AppendWordTokens(tokens, NumberToPhonStream(n), isContent: true);
                }
                else if (m.Groups[2].Success)      // word
                {
                    string word = m.Groups[2].Value;
                    string upper = word.ToUpperInvariant();

                    // Resolution order: heteronym resolver, then letter-by-letter spelling
                    // for all-caps words the dictionary doesn't know, then the regular
                    // word-to-phoneme path.
                    byte[]? stream = HeteronymResolver.Resolve(ctxWords, wordIdx);
                    if (stream == null && IsAllCaps(word) && _dict.Search(upper) == null)
                        stream = SpellOutAcronym(upper);
                    stream ??= WordToPhonStream(upper);

                    AppendWordTokens(tokens, stream, !FuncWords.Contains(word));
                    wordIdx++;
                }
                else if (m.Groups[3].Success)      // , ; :
                {
                    // Clause punctuation inserts a pause-type silence token.
                    tokens.Add(new PhonemeToken
                    {
                        Phon = _SIL_,
                        Ctrl = kTerm_Bound | ((long)kBND_Pause << kSilenceTypeShift),
                    });
                    LastEndPunct = _Comma_;
                }
                else if (m.Groups[4].Success)      // ... . ! ? ~
                {
                    // Sentence punctuation emits no token; it only records the end type.
                    LastEndPunct = m.Groups[4].Value switch
                    {
                        "..." => _Ellipsis_,
                        "?"   => _Quest_,
                        "!"   => _Exclam_,
                        "~"   => _Tilde_,
                        _     => _Period_,
                    };
                }
                // whitespace: skip
            }

            return tokens.ToArray();
        }

        // Converts a whole utterance into one flat phoneme-token array.
        // Command segments are skipped (TtsEngine handles them), singing segments are
        // copied through unchanged, and plain-text segments are normalized, tokenized
        // and converted word by word.
        // LastEndPunct is reset to _Period_ once at the start and then follows the most
        // recent punctuation token across all segments.
        public PhonemeToken[] TextToPhonemes(string text)
        {
            LastEndPunct = _Period_;
            var output = new List<PhonemeToken>();

            // Ordered segments: plain text spans interleaved with singing blocks.
            foreach (var segment in EmbeddedCmd.ParseSegments(text))
            {
                if (segment.IsCommand)
                    continue; // handled by TtsEngine, not FrontEnd

                if (segment.IsSinging)
                {
                    output.AddRange(segment.Singing!);
                    continue;
                }

                var tokenMatches = TokenRe.Matches(Normalize(segment.PlainText!));

                // Upper-cased words of this segment, used as heteronym context.
                var contextWords = new List<string>();
                foreach (Match candidate in tokenMatches)
                {
                    Group wordGroup = candidate.Groups[2];
                    if (wordGroup.Success)
                        contextWords.Add(wordGroup.Value.ToUpperInvariant());
                }

                int wordIndex = 0;
                foreach (Match token in tokenMatches)
                {
                    Group number = token.Groups[1];
                    if (number.Success)
                    {
                        if (long.TryParse(number.Value, out long value))
                            AppendWordTokens(output, NumberToPhonStream(value), isContent: true);
                        continue;
                    }

                    Group wordGroup = token.Groups[2];
                    if (wordGroup.Success)
                    {
                        string word = wordGroup.Value;
                        var phonStream = HeteronymResolver.Resolve(contextWords, wordIndex);
                        phonStream ??= WordToPhonStream(word.ToUpperInvariant());
                        AppendWordTokens(output, phonStream, !FuncWords.Contains(word));
                        wordIndex++;
                        continue;
                    }

                    if (token.Groups[3].Success)
                    {
                        // Clause punctuation (, ; :) -> pause-type silence token.
                        var pause = new PhonemeToken
                        {
                            Phon = _SIL_,
                            Ctrl = kTerm_Bound | ((long)kBND_Pause << kSilenceTypeShift),
                        };
                        output.Add(pause);
                        LastEndPunct = _Comma_;
                        continue;
                    }

                    if (token.Groups[4].Success)
                    {
                        // Sentence punctuation (... . ! ? ~) only records the end type.
                        switch (token.Groups[4].Value)
                        {
                            case "...": LastEndPunct = _Ellipsis_; break;
                            case "?":   LastEndPunct = _Quest_;    break;
                            case "!":   LastEndPunct = _Exclam_;   break;
                            case "~":   LastEndPunct = _Tilde_;    break;
                            default:    LastEndPunct = _Period_;   break;
                        }
                    }
                    // Anything else is whitespace and produces nothing.
                }
            }

            return output.ToArray();
        }

        // Text normalization
        // Nested static class keeps normalizer state (regexes, tables) out of the
        // FrontEnd field list without a separate file.
        internal static class Normalizer
        {
            // Repeated-syllable words: "hahaha" -> "ha ha ha", "nanana" -> "na na na".
            // Fires for 3+ repetitions of a 1–3 char unit. Rare in real English at that count.
            // The \b anchors require the whole word to consist of the repeated unit, so a
            // word with a leftover fragment (e.g. "lolol") is left unchanged.
            // Non-greedy {1,3}? so "iiiiiiiii" splits on "i" not "iii".
            static readonly Regex ReReduplicate = new(
                @"\b([a-zA-Z]{1,3}?)\1{2,}\b", RegexOptions.Compiled);

            // "$", optional whitespace, whole amount (group 1), optional "." + 1–2 digits of cents (group 2).
            static readonly Regex ReCurrency = new(
                @"\$\s*(\d+)(?:\.(\d{1,2}))?", RegexOptions.Compiled);
            // Digit run (group 1), optional whitespace, "%".
            static readonly Regex RePercent = new(
                @"(\d+)\s*%", RegexOptions.Compiled);
            // Digit run (group 1), optional whitespace, then st/nd/rd/th (case-insensitive).
            // The suffix is not checked against the number, so "2th" matches too.
            static readonly Regex ReOrdinal = new(
                @"\b(\d+)\s*(?:st|nd|rd|th)\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);
            // Integer part (group 1), ".", fraction digits (group 2).
            static readonly Regex ReDecimal = new(
                @"\b(\d+)\.(\d+)\b", RegexOptions.Compiled);
            // Years: 4-digit numbers in 1000–2099, not preceded by '.', a currency sign
            // ($ € £) or another digit, and not followed by an ordinal suffix, %, or
            // another digit.
            // The ordinal suffixes in the lookahead are closed with \b so that only a
            // standalone suffix blocks the match; without it any following word that
            // merely starts with st/nd/rd/th ("1999 the ...", "2020 study") would stop
            // the number from being read as a year.
            static readonly Regex ReYear = new(
                @"(?<![.$€£\d])\b(1\d{3}|20\d{2})\b(?!\s*(?:(?:st|nd|rd|th)\b|%|\d))",
                RegexOptions.Compiled | RegexOptions.IgnoreCase);
            // Dotted abbreviations (e.g., i.e., a.m.) — matched before the regular
            // abbreviation pass so their embedded periods don't split sentences.
            // Case-insensitive; the lookarounds require that no word character touches
            // the abbreviation on either side.
            static readonly Regex ReAbbrevDotted = new(
                @"(?<!\w)(i\.e\.|e\.g\.|a\.m\.|p\.m\.|p\.s\.|w\.r\.t\.|b\.c\.|a\.d\.)(?!\w)",
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            // Spoken replacement for each dotted abbreviation, keyed by the full matched
            // text including its periods. Keys mirror the alternatives of ReAbbrevDotted.
            static readonly Dictionary<string, string> AbbrevDottedMap =
                new(StringComparer.OrdinalIgnoreCase)
                {
                    ["i.e."] = "that is",
                    ["e.g."] = "for example",
                    ["a.m."] = "ay em",
                    ["p.m."] = "pee em",
                    ["p.s."] = "postscript",
                    ["w.r.t."] = "with regard to",
                    ["b.c."] = "bee see",
                    ["a.d."] = "ay dee",
                };

            // Single-dot abbreviations: a known abbreviation (group 1, case-insensitive)
            // immediately followed by a period. The period is part of the match, so it is
            // consumed when Normalize substitutes the expansion.
            // NOTE(review): this also fires when the abbreviation is an ordinary word at
            // the end of a sentence ("to the max."), removing that sentence's period.
            static readonly Regex ReAbbrev = new(
                @"\b(Dr|Mr|Mrs|Ms|Prof|Jr|Sr|Vs|Etc|St|Ave|Blvd|Rd|Ln"
              + @"|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec"
              + @"|Lt|Cpt|Capt|Gen|Sgt|Pvt|Col|Maj|Rev|Dept|Inc|Corp|Approx"
              + @"|Max|Min|Avg|Govt|Vol|Fig|Ref|Intl|Natl|Div|Asst|Mgr|Dir"
              + @"|Assoc|Admin|Est|Cont|Abbr|Attr|Dist|Pop|Temp|Tech|Elec)\.",
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            // Expansion for each abbreviation in ReAbbrev, keyed without the period.
            // Every alternative in ReAbbrev needs an entry here: Normalize looks up
            // group 1 of each match in this map.
            static readonly Dictionary<string, string> AbbrevMap =
                new(StringComparer.OrdinalIgnoreCase)
                {
                    // Titles
                    ["Dr"]    = "Doctor",
                    ["Mr"]    = "Mister",
                    ["Mrs"]   = "Missus",
                    ["Ms"]    = "Miss",
                    ["Prof"]  = "Professor",
                    ["Jr"]    = "Junior",
                    ["Sr"]    = "Senior",
                    // Common
                    ["Vs"]    = "versus",
                    ["Etc"]   = "etcetera",
                    ["Approx"]= "approximately",
                    ["Max"]   = "maximum",
                    ["Min"]   = "minimum",
                    ["Avg"]   = "average",
                    ["Vol"]   = "volume",
                    ["Fig"]   = "figure",
                    ["Ref"]   = "reference",
                    ["Est"]   = "established",
                    ["Cont"]  = "continued",
                    ["Abbr"]  = "abbreviation",
                    ["Attr"]  = "attributed",
                    ["Dist"]  = "district",
                    ["Pop"]   = "population",
                    ["Temp"]  = "temperature",
                    ["Tech"]  = "technical",
                    ["Elec"]  = "electric",
                    // Addresses
                    ["St"]    = "Street",
                    ["Ave"]   = "Avenue",
                    ["Blvd"]  = "Boulevard",
                    ["Rd"]    = "Road",
                    ["Ln"]    = "Lane",
                    // Military / ranks
                    ["Lt"]    = "Lieutenant",
                    ["Cpt"]   = "Captain",
                    ["Capt"]  = "Captain",
                    ["Gen"]   = "General",
                    ["Sgt"]   = "Sergeant",
                    ["Pvt"]   = "Private",
                    ["Col"]   = "Colonel",
                    ["Maj"]   = "Major",
                    ["Rev"]   = "Reverend",
                    // Org
                    ["Dept"]  = "Department",
                    ["Inc"]   = "Incorporated",
                    ["Corp"]  = "Corporation",
                    ["Govt"]  = "government",
                    ["Div"]   = "division",
                    ["Intl"]  = "international",
                    ["Natl"]  = "national",
                    ["Assoc"] = "association",
                    ["Admin"] = "administration",
                    ["Asst"]  = "assistant",
                    ["Mgr"]   = "manager",
                    ["Dir"]   = "director",
                    // Months
                    ["Jan"]   = "January",
                    ["Feb"]   = "February",
                    ["Mar"]   = "March",
                    ["Apr"]   = "April",
                    ["Jun"]   = "June",
                    ["Jul"]   = "July",
                    ["Aug"]   = "August",
                    ["Sep"]   = "September",
                    ["Sept"]  = "September",
                    ["Oct"]   = "October",
                    ["Nov"]   = "November",
                    ["Dec"]   = "December",
                };

            // Words for 0–9, indexed by value.
            static readonly string[] DigitWords =
            {
                "zero", "one", "two", "three", "four",
                "five", "six", "seven", "eight", "nine",
            };

            // Words for 10–19, indexed by (value - 10).
            static readonly string[] TeenWords =
            {
                "ten", "eleven", "twelve", "thirteen", "fourteen",
                "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
            };

            // Spells a number in 0..99 as space-separated words ("forty two").
            // Values outside that range index past the lookup tables and throw.
            static string SmallCardinal(int n) => n switch
            {
                < 10 => DigitWords[n],
                < 20 => TeenWords[n - 10],
                _ => n % 10 == 0
                    ? TensWords[n / 10]
                    : TensWords[n / 10] + " " + DigitWords[n % 10],
            };

            // Spells a four-digit number the way a year is read aloud:
            //   1984 -> "nineteen eighty four"   (century pair + two-digit pair)
            //   1905 -> "nineteen oh five"
            //   1900 -> "nineteen hundred"
            //   2005 -> "two thousand five"      (2001–2009)
            //   1000 -> "one thousand", 2000 -> "two thousand"   (exact thousands)
            // Built on SmallCardinal, so y / 100 must stay below 100.
            static string YearToWords(int y)
            {
                // Exact thousands read as "<n> thousand"; without this 1000 would come
                // out as "ten hundred".
                if (y > 0 && y % 1000 == 0) return SmallCardinal(y / 1000) + " thousand";

                int hi = y / 100;
                int lo = y % 100;
                if (y > 2000 && y < 2010) return "two thousand " + SmallCardinal(lo);
                string hiPart = SmallCardinal(hi);
                if (lo == 0)  return hiPart + " hundred";
                if (lo < 10)  return hiPart + " oh " + SmallCardinal(lo);
                return hiPart + " " + SmallCardinal(lo);
            }

            // Ordinal words for 0–19, indexed by value.
            static readonly string[] OnesOrd = new string[]
            {
                "zeroth","first","second","third","fourth","fifth","sixth","seventh",
                "eighth","ninth","tenth","eleventh","twelfth","thirteenth","fourteenth",
                "fifteenth","sixteenth","seventeenth","eighteenth","nineteenth",
            };
            // Ordinal words for exact tens, indexed by (value / 10); slots 0 and 1 are unused.
            static readonly string[] TensOrd = new string[]
            {
                "","","twentieth","thirtieth","fortieth","fiftieth",
                "sixtieth","seventieth","eightieth","ninetieth",
            };
            // Cardinal words for exact tens, indexed by (value / 10); slots 0 and 1 are unused.
            static readonly string[] TensWords = new string[]
            {
                "","","twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety",
            };

            // Converts an ordinal number to text.
            //   0..99            -> words ("twenty first", "fortieth")
            //   100+, n%100 != 0 -> cardinal hundreds as digits + ordinal remainder
            //                       ("101" -> "100 first"); the digits are left for the
            //                       number path to speak, so the ordinal ending survives
            //   100+, n%100 == 0 -> plain digits (no "hundredth"/"thousandth" wording here)
            //   negative         -> plain digits
            static string OrdinalToWord(long n)
            {
                if (n < 0) return n.ToString();
                if (n < 20) return OnesOrd[n];
                if (n < 100)
                {
                    int t = (int)(n / 10), o = (int)(n % 10);
                    return o == 0 ? TensOrd[t] : TensWords[t] + " " + OnesOrd[o];
                }

                long rest = n % 100;
                if (rest == 0) return n.ToString(); // cardinal fallback for exact hundreds
                return (n - rest).ToString() + " " + OrdinalToWord(rest);
            }

            // Rewrites raw text into a form the tokenizer can speak: currency, percentages,
            // ordinals, years, decimals and abbreviations are expanded to words (or to
            // digits + words), dashes become breaks or spaces, and "hahaha"-style
            // reduplication is split into syllables. Steps run in the numbered order below.
            //
            // Every numeric step is defensive: the patterns use \d+, which is unbounded and
            // also matches non-ASCII decimal digits, so a match that cannot be parsed is
            // left in the text unchanged instead of throwing.
            public static string Normalize(string text)
            {
                // 0. Split CamelCase/PascalCase so "SharpTalk" -> "Sharp Talk"
                text = CamelSplit.Replace(text, " ");

                // 1. Currency — before decimal so $3.99 isn't split at the dot
                text = ReCurrency.Replace(text, m =>
                {
                    if (!long.TryParse(m.Groups[1].Value, out long dollars))
                        return m.Value;

                    long cents = 0;
                    if (m.Groups[2].Success)
                    {
                        // "$3.5" means 50 cents: right-pad the fraction to two digits.
                        string cs = m.Groups[2].Value.PadRight(2, '0')[..2];
                        if (!long.TryParse(cs, out cents))
                            return m.Value;
                    }

                    string r = dollars + " dollar" + (dollars == 1 ? "" : "s");
                    if (cents > 0)
                        r += " and " + cents + " cent" + (cents == 1 ? "" : "s");
                    return r;
                });

                // 2. Percentages
                text = RePercent.Replace(text, m => m.Groups[1].Value + " percent");

                // 3. Ordinals — before decimals to avoid "1.5th" oddities
                text = ReOrdinal.Replace(text, m =>
                    long.TryParse(m.Groups[1].Value, out long n) ? OrdinalToWord(n) : m.Value);

                // 4. Years — 4-digit numbers read as pairs ("nineteen eighty four")
                text = ReYear.Replace(text, m =>
                    int.TryParse(m.Value, out int year) ? YearToWords(year) : m.Value);

                // 5. Decimal numbers — spell each digit after the point individually
                text = ReDecimal.Replace(text, m =>
                {
                    string r = m.Groups[1].Value + " point";
                    foreach (char c in m.Groups[2].Value)
                    {
                        // Only ASCII digits index into DigitWords.
                        if (c < '0' || c > '9')
                            return m.Value;
                        r += " " + DigitWords[c - '0'];
                    }
                    return r;
                });

                // 6. Dotted abbreviations (i.e., e.g., a.m. …) — must run before step 7
                //    so their embedded periods don't trigger sentence splitting.
                text = ReAbbrevDotted.Replace(text, m =>
                    AbbrevDottedMap.TryGetValue(m.Value, out var v) ? v : m.Value);

                // 7. Single-dot abbreviations. TryGetValue rather than the indexer: the
                //    regex's case-insensitive matching and the map's ordinal comparer are
                //    not guaranteed to agree on every character.
                text = ReAbbrev.Replace(text, m =>
                    AbbrevMap.TryGetValue(m.Groups[1].Value, out var full) ? full : m.Value);

                // 8. Em-dash, en-dash, double-hyphen -> sentence break; plain hyphens -> space
                text = text.Replace("—", ". ").Replace("–", ". ").Replace("--", ". ");
                text = text.Replace('-', ' ');

                // 9. Expressive reduplication: "hahaha" -> "ha ha ha"
                text = ReReduplicate.Replace(text, m => {
                    string unit = m.Groups[1].Value;
                    int count = m.Value.Length / unit.Length;
                    return string.Join(" ", System.Linq.Enumerable.Repeat(unit, count));
                });

                return text;
            }
        }

        // Word -> raw phoneme stream

        // Hardcoded letter pronunciations — A-Z indexed by (char - 'A').
        // Each entry is a raw phoneme-code stream in decimal; the trailing comment on
        // each row gives the intended phonemes.
        // Stress marker 56=STRESS1 placed immediately before the stressed vowel.
        // Never routed through dict or LTS so missing entries can't break them.
        // NOTE(review): the W entry carries two primary-stress markers (before AH and
        // before UW) — confirm both are meant to be primary.
        static readonly byte[][] LetterPhonemes =
        {
            new byte[]{ 56,10 },                       // A  -> EY
            new byte[]{ 45,56, 0 },                    // B  -> B IY
            new byte[]{ 40,56, 0 },                    // C  -> S IY
            new byte[]{ 47,56, 0 },                    // D  -> D IY
            new byte[]{ 56, 0 },                       // E  -> IY
            new byte[]{ 56, 2,36 },                    // F  -> EH F
            new byte[]{ 51,56, 0 },                    // G  -> JH IY
            new byte[]{ 56,10,50 },                    // H  -> EY CH  (aitch)
            new byte[]{ 56,11 },                       // I  -> AY
            new byte[]{ 51,56,10 },                    // J  -> JH EY
            new byte[]{ 48,56,10 },                    // K  -> K EY
            new byte[]{ 56, 2,31 },                    // L  -> EH L
            new byte[]{ 56, 2,33 },                    // M  -> EH M
            new byte[]{ 56, 2,34 },                    // N  -> EH N
            new byte[]{ 56,14 },                       // O  -> OW
            new byte[]{ 44,56, 0 },                    // P  -> P IY
            new byte[]{ 48,29,56,15 },                 // Q  -> K Y UW  (cue)
            new byte[]{ 56, 4,30 },                    // R  -> AA R
            new byte[]{ 56, 2,40 },                    // S  -> EH S
            new byte[]{ 46,56, 0 },                    // T  -> T IY
            new byte[]{ 29,56,15 },                    // U  -> Y UW
            new byte[]{ 37,56, 0 },                    // V  -> V IY
            new byte[]{ 47,56, 5,45, 8,31,29,56,15 }, // W  -> D AH B AX L Y UW  (double-you)
            new byte[]{ 56, 2,48,40 },                 // X  -> EH K S
            new byte[]{ 28,56,11 },                    // Y  -> W AY
            new byte[]{ 41,56, 0 },                    // Z  -> Z IY
        };

        // Phoneme sequences for every word the normalizer can produce.
        // Checked before dict + LTS so dictionary swaps never affect normalizer output.
        static readonly Dictionary<string, byte[]> NormWords = new()
        {
            //    Digits   
            ["ZERO"] = new byte[]{ 0x29,0x38,0x01,0x1E,0x0E },
            ["ONE"]  = new byte[]{ 0x1C,0x38,0x05,0x22 },
            ["TWO"]  = new byte[]{ 0x2E,0x38,0x0F },
            ["THREE"]= new byte[]{ 0x26,0x1E,0x38,0x00 },
            ["FOUR"] = new byte[]{ 0x24,0x38,0x06,0x1E },
            ["FIVE"] = new byte[]{ 0x24,0x38,0x0B,0x25 },
            ["SIX"]  = new byte[]{ 0x28,0x38,0x01,0x30,0x28 },
            ["SEVEN"]= new byte[]{ 0x28,0x38,0x02,0x25,0x08,0x22 },
            ["EIGHT"]= new byte[]{ 0x38,0x0A,0x2E },
            ["NINE"] = new byte[]{ 0x22,0x38,0x0B,0x22 },
            //    Teens   
            ["TEN"]      = new byte[]{ 0x2E,0x38,0x02,0x22 },
            ["ELEVEN"]   = new byte[]{ 0x16,0x1F,0x38,0x02,0x25,0x08,0x22 },
            ["TWELVE"]   = new byte[]{ 0x2E,0x1C,0x38,0x02,0x1F,0x25 },
            ["THIRTEEN"] = new byte[]{ 0x26,0x38,0x09,0x2E,0x38,0x00,0x22 },
            ["FOURTEEN"] = new byte[]{ 0x24,0x38,0x06,0x1E,0x2E,0x38,0x00,0x22 },
            ["FIFTEEN"]  = new byte[]{ 0x24,0x16,0x24,0x2E,0x38,0x00,0x22 },
            ["SIXTEEN"]  = new byte[]{ 0x28,0x16,0x30,0x28,0x2E,0x38,0x00,0x22 },
            ["SEVENTEEN"]= new byte[]{ 0x28,0x38,0x02,0x25,0x08,0x22,0x2E,0x38,0x00,0x22 },
            ["EIGHTEEN"] = new byte[]{ 0x0A,0x2E,0x38,0x00,0x22 },
            ["NINETEEN"] = new byte[]{ 0x22,0x38,0x0B,0x22,0x2E,0x38,0x00,0x22 },
            //    Tens   
            ["TWENTY"] = new byte[]{ 0x2E,0x1C,0x38,0x02,0x22,0x2E,0x00 },
            ["THIRTY"] = new byte[]{ 0x26,0x38,0x09,0x2F,0x39,0x00 },
            ["FORTY"]  = new byte[]{ 0x24,0x38,0x06,0x1E,0x2E,0x00 },
            ["FIFTY"]  = new byte[]{ 0x24,0x38,0x01,0x24,0x2E,0x00 },
            ["SIXTY"]  = new byte[]{ 0x28,0x38,0x01,0x30,0x28,0x2E,0x00 },
            ["SEVENTY"]= new byte[]{ 0x28,0x38,0x02,0x25,0x08,0x22,0x2E,0x00 },
            ["EIGHTY"] = new byte[]{ 0x38,0x0A,0x2E,0x00 },
            ["NINETY"] = new byte[]{ 0x22,0x38,0x0B,0x22,0x2E,0x00 },
            //    Large / misc number   
            ["HUNDRED"]  = new byte[]{ 0x20,0x38,0x05,0x22,0x2F,0x1E,0x08,0x2F },
            ["THOUSAND"] = new byte[]{ 0x26,0x38,0x0D,0x29,0x08,0x22,0x2F },
            ["MILLION"]  = new byte[]{ 0x21,0x38,0x01,0x1F,0x1D,0x08,0x22 },
            ["BILLION"]  = new byte[]{ 0x2D,0x38,0x01,0x1F,0x1D,0x08,0x22 },
            ["OH"]       = new byte[]{ 0x38,0x0E },
            ["POINT"]    = new byte[]{ 0x2C,0x38,0x0C,0x22,0x2E },
            ["AND"]      = new byte[]{ 0x08,0x22,0x2F },
            //    Currency / percent   
            ["DOLLAR"]  = new byte[]{ 0x2F,0x38,0x04,0x1F,0x09 },
            ["DOLLARS"] = new byte[]{ 0x2F,0x38,0x04,0x1F,0x09,0x29 },
            ["CENT"]    = new byte[]{ 0x28,0x38,0x02,0x22,0x2E },
            ["CENTS"]   = new byte[]{ 0x28,0x38,0x02,0x22,0x2E,0x28 },
            ["PERCENT"] = new byte[]{ 0x2C,0x09,0x28,0x38,0x02,0x22,0x2E },
            //    Ordinals   
            ["ZEROTH"]     = new byte[]{ 0x29,0x38,0x00,0x1E,0x0E,0x26 },
            ["FIRST"]      = new byte[]{ 0x24,0x38,0x09,0x28,0x2E },
            ["SECOND"]     = new byte[]{ 0x28,0x38,0x02,0x30,0x08,0x22,0x2F },
            ["THIRD"]      = new byte[]{ 0x26,0x38,0x09,0x2F },
            ["FOURTH"]     = new byte[]{ 0x24,0x38,0x06,0x1E,0x26 },
            ["FIFTH"]      = new byte[]{ 0x24,0x38,0x01,0x24,0x26 },
            ["SIXTH"]      = new byte[]{ 0x28,0x38,0x01,0x30,0x28,0x26 },
            ["SEVENTH"]    = new byte[]{ 0x28,0x38,0x02,0x25,0x08,0x22,0x26 },
            ["EIGHTH"]     = new byte[]{ 0x38,0x0A,0x2E,0x26 },
            ["NINTH"]      = new byte[]{ 0x22,0x38,0x0B,0x22,0x26 },
            ["TENTH"]      = new byte[]{ 0x2E,0x38,0x02,0x22,0x26 },
            ["ELEVENTH"]   = new byte[]{ 0x16,0x1F,0x38,0x02,0x25,0x08,0x22,0x26 },
            ["TWELFTH"]    = new byte[]{ 0x2E,0x1C,0x38,0x02,0x1F,0x24,0x26 },
            ["THIRTEENTH"] = new byte[]{ 0x26,0x38,0x09,0x2E,0x38,0x00,0x22,0x26 },
            ["FOURTEENTH"] = new byte[]{ 0x24,0x38,0x06,0x1E,0x2E,0x38,0x00,0x22,0x26 },
            ["FIFTEENTH"]  = new byte[]{ 0x24,0x16,0x24,0x2E,0x38,0x00,0x22,0x26 },
            ["SIXTEENTH"]  = new byte[]{ 0x28,0x16,0x30,0x28,0x2E,0x38,0x00,0x22,0x26 },
            ["SEVENTEENTH"]= new byte[]{ 0x28,0x38,0x02,0x25,0x08,0x22,0x2E,0x38,0x00,0x22,0x26 },
            ["EIGHTEENTH"] = new byte[]{ 0x0A,0x2E,0x38,0x00,0x22,0x26 },
            ["NINETEENTH"] = new byte[]{ 0x22,0x38,0x0B,0x22,0x2E,0x38,0x00,0x22,0x26 },
            ["TWENTIETH"]  = new byte[]{ 0x2E,0x1C,0x38,0x02,0x22,0x2E,0x00,0x08,0x26 },
            ["THIRTIETH"]  = new byte[]{ 0x26,0x38,0x09,0x2E,0x00,0x08,0x26 },
            ["FORTIETH"]   = new byte[]{ 0x24,0x38,0x06,0x1E,0x2E,0x00,0x16,0x26 },
            ["FIFTIETH"]   = new byte[]{ 0x24,0x38,0x01,0x24,0x2E,0x00,0x16,0x26 },
            ["SIXTIETH"]   = new byte[]{ 0x28,0x38,0x01,0x30,0x28,0x2E,0x00,0x16,0x26 },
            ["SEVENTIETH"] = new byte[]{ 0x28,0x38,0x02,0x25,0x08,0x22,0x2E,0x00,0x16,0x26 },
            ["EIGHTIETH"]  = new byte[]{ 0x38,0x0A,0x2E,0x00,0x16,0x26 },
            ["NINETIETH"]  = new byte[]{ 0x22,0x38,0x0B,0x22,0x2E,0x00,0x16,0x26 },
            //    Letter names (used by dotted abbreviation expansions)   
            ["AY"]  = new byte[]{ 0x38,0x0A },
            ["BEE"] = new byte[]{ 0x2D,0x38,0x00 },
            ["SEE"] = new byte[]{ 0x28,0x38,0x00 },
            ["DEE"] = new byte[]{ 0x2F,0x38,0x00 },
            ["EF"]  = new byte[]{ 0x38,0x02,0x24 },
            ["EM"]  = new byte[]{ 0x38,0x02,0x21 },
            ["PEE"] = new byte[]{ 0x2C,0x38,0x00 },
            //    Dotted abbreviation expansions   
            ["THAT"]      = new byte[]{ 0x27,0x38,0x03,0x2E },
            ["IS"]        = new byte[]{ 0x38,0x01,0x29 },
            ["FOR"]       = new byte[]{ 0x24,0x38,0x06,0x1E },
            ["EXAMPLE"]   = new byte[]{ 0x16,0x31,0x29,0x38,0x03,0x21,0x2C,0x08,0x1F },
            ["POSTSCRIPT"]= new byte[]{ 0x2C,0x38,0x0E,0x28,0x30,0x1E,0x39,0x01,0x2C,0x2E },
            ["WITH"]      = new byte[]{ 0x1C,0x38,0x01,0x27 },
            ["REGARD"]    = new byte[]{ 0x1E,0x16,0x31,0x38,0x04,0x1E,0x2F },
            ["TO"]        = new byte[]{ 0x2E,0x38,0x0F },
            //    Titles   
            ["DOCTOR"]    = new byte[]{ 0x2F,0x38,0x04,0x30,0x2E,0x09 },
            ["MISTER"]    = new byte[]{ 0x21,0x38,0x01,0x28,0x2E,0x09 },
            ["MISSUS"]    = new byte[]{ 0x21,0x38,0x01,0x28,0x16,0x29 },
            ["MISS"]      = new byte[]{ 0x21,0x38,0x01,0x28 },
            ["PROFESSOR"] = new byte[]{ 0x2C,0x1E,0x08,0x24,0x38,0x02,0x28,0x09 },
            ["JUNIOR"]    = new byte[]{ 0x33,0x38,0x0F,0x22,0x1D,0x09 },
            ["SENIOR"]    = new byte[]{ 0x28,0x38,0x00,0x22,0x1D,0x09 },
            //    Common abbreviation expansions   
            ["VERSUS"]        = new byte[]{ 0x25,0x38,0x09,0x28,0x08,0x28 },
            ["ETCETERA"]      = new byte[]{ 0x38,0x02,0x2E,0x28,0x38,0x02,0x2E,0x09,0x08 },
            ["APPROXIMATELY"] = new byte[]{ 0x08,0x2C,0x1E,0x38,0x04,0x30,0x28,0x08,0x21,0x08,0x2E,0x1F,0x00 },
            ["MAXIMUM"]       = new byte[]{ 0x21,0x38,0x03,0x30,0x28,0x08,0x21,0x08,0x21 },
            ["MINIMUM"]       = new byte[]{ 0x21,0x38,0x01,0x22,0x08,0x21,0x08,0x21 },
            ["AVERAGE"]       = new byte[]{ 0x38,0x03,0x25,0x09,0x16,0x33 },
            ["VOLUME"]        = new byte[]{ 0x25,0x38,0x04,0x1F,0x1D,0x0F,0x21 },
            ["FIGURE"]        = new byte[]{ 0x24,0x38,0x01,0x31,0x1D,0x09 },
            ["REFERENCE"]     = new byte[]{ 0x1E,0x38,0x02,0x24,0x09,0x08,0x22,0x28 },
            ["ESTABLISHED"]   = new byte[]{ 0x16,0x28,0x2E,0x38,0x03,0x2D,0x1F,0x16,0x2A,0x2E },
            ["CONTINUED"]     = new byte[]{ 0x30,0x08,0x22,0x2E,0x38,0x01,0x22,0x1D,0x0F,0x2F },
            ["ABBREVIATION"]  = new byte[]{ 0x08,0x2D,0x1E,0x39,0x00,0x25,0x00,0x38,0x0A,0x2A,0x08,0x22 },
            ["ATTRIBUTED"]    = new byte[]{ 0x08,0x2E,0x1E,0x38,0x01,0x2D,0x1D,0x08,0x2E,0x16,0x2F },
            ["DISTRICT"]      = new byte[]{ 0x2F,0x38,0x01,0x28,0x2E,0x1E,0x16,0x30,0x2E },
            ["POPULATION"]    = new byte[]{ 0x2C,0x39,0x04,0x2C,0x1D,0x08,0x1F,0x38,0x0A,0x2A,0x08,0x22 },
            ["TEMPERATURE"]   = new byte[]{ 0x2E,0x38,0x02,0x21,0x2C,0x1E,0x08,0x32,0x09 },
            ["TECHNICAL"]     = new byte[]{ 0x2E,0x38,0x02,0x30,0x22,0x16,0x30,0x08,0x1F },
            ["ELECTRIC"]      = new byte[]{ 0x16,0x1F,0x38,0x02,0x30,0x2E,0x1E,0x16,0x30 },
            //    Address   
            ["STREET"]    = new byte[]{ 0x28,0x2E,0x1E,0x38,0x00,0x2E },
            ["AVENUE"]    = new byte[]{ 0x38,0x03,0x25,0x08,0x22,0x39,0x0F },
            ["BOULEVARD"] = new byte[]{ 0x2D,0x38,0x07,0x1F,0x08,0x25,0x39,0x04,0x1E,0x2F },
            ["ROAD"]      = new byte[]{ 0x1E,0x38,0x0E,0x2F },
            ["LANE"]      = new byte[]{ 0x1F,0x38,0x0A,0x22 },
            //    Military   
            ["LIEUTENANT"] = new byte[]{ 0x1F,0x0F,0x2E,0x38,0x02,0x22,0x08,0x22,0x2E },
            ["CAPTAIN"]    = new byte[]{ 0x30,0x38,0x03,0x2C,0x2E,0x08,0x22 },
            ["GENERAL"]    = new byte[]{ 0x33,0x38,0x02,0x22,0x09,0x08,0x1F },
            ["SERGEANT"]   = new byte[]{ 0x28,0x38,0x04,0x1E,0x33,0x08,0x22,0x2E },
            ["PRIVATE"]    = new byte[]{ 0x2C,0x1E,0x38,0x0B,0x25,0x08,0x2E },
            ["COLONEL"]    = new byte[]{ 0x30,0x38,0x09,0x22,0x08,0x1F },
            ["MAJOR"]      = new byte[]{ 0x21,0x38,0x0A,0x33,0x09 },
            ["REVEREND"]   = new byte[]{ 0x1E,0x38,0x02,0x25,0x09,0x08,0x22,0x2F },
            //    Org   
            ["DEPARTMENT"]    = new byte[]{ 0x2F,0x16,0x2C,0x38,0x04,0x1E,0x2E,0x21,0x08,0x22,0x2E },
            ["INCORPORATED"]  = new byte[]{ 0x39,0x01,0x22,0x30,0x38,0x06,0x1E,0x2C,0x09,0x39,0x0A,0x2E,0x16,0x2F },
            ["CORPORATION"]   = new byte[]{ 0x30,0x39,0x06,0x1E,0x2C,0x09,0x38,0x0A,0x2A,0x08,0x22 },
            ["GOVERNMENT"]    = new byte[]{ 0x31,0x38,0x05,0x25,0x09,0x21,0x08,0x22,0x2E },
            ["DIVISION"]      = new byte[]{ 0x2F,0x16,0x25,0x38,0x01,0x2B,0x08,0x22 },
            ["INTERNATIONAL"] = new byte[]{ 0x39,0x01,0x22,0x2E,0x09,0x22,0x38,0x03,0x2A,0x08,0x22,0x08,0x1F },
            ["NATIONAL"]      = new byte[]{ 0x22,0x38,0x03,0x2A,0x08,0x22,0x08,0x1F },
            ["ASSOCIATION"]   = new byte[]{ 0x08,0x28,0x39,0x0E,0x28,0x00,0x38,0x0A,0x2A,0x08,0x22 },
            ["ADMINISTRATION"]= new byte[]{ 0x03,0x2F,0x21,0x39,0x01,0x22,0x16,0x28,0x2E,0x1E,0x38,0x0A,0x2A,0x08,0x22 },
            ["ASSISTANT"]     = new byte[]{ 0x08,0x28,0x38,0x01,0x28,0x2E,0x08,0x22,0x2E },
            ["MANAGER"]       = new byte[]{ 0x21,0x38,0x03,0x22,0x08,0x33,0x09 },
            ["DIRECTOR"]      = new byte[]{ 0x2F,0x09,0x38,0x02,0x30,0x2E,0x09 },
            //    Months   
            ["JANUARY"]  = new byte[]{ 0x33,0x38,0x03,0x22,0x1D,0x0F,0x39,0x02,0x1E,0x00 },
            ["FEBRUARY"] = new byte[]{ 0x24,0x38,0x02,0x2D,0x1D,0x08,0x1C,0x39,0x02,0x1E,0x00 },
            ["MARCH"]    = new byte[]{ 0x21,0x38,0x04,0x1E,0x32 },
            ["APRIL"]    = new byte[]{ 0x38,0x0A,0x2C,0x1E,0x08,0x1F },
            ["JUNE"]     = new byte[]{ 0x33,0x38,0x0F,0x22 },
            ["JULY"]     = new byte[]{ 0x33,0x39,0x0F,0x1F,0x38,0x0B },
            ["AUGUST"]   = new byte[]{ 0x38,0x04,0x31,0x08,0x28,0x2E },
            ["SEPTEMBER"]= new byte[]{ 0x28,0x02,0x2C,0x2E,0x38,0x02,0x21,0x2D,0x09 },
            ["OCTOBER"]  = new byte[]{ 0x04,0x30,0x2E,0x38,0x0E,0x2D,0x09 },
            ["NOVEMBER"] = new byte[]{ 0x22,0x0E,0x25,0x38,0x02,0x21,0x2D,0x09 },
            ["DECEMBER"] = new byte[]{ 0x2F,0x16,0x28,0x38,0x02,0x21,0x2D,0x09 },
        };

        // For all-caps words absent from the dict, inject letter phonemes directly:
        // no dict lookup, no LTS. Each letter becomes its own word-boundary token.
        // Builds a phoneme stream that spells `upper` letter by letter.
        //   upper   - the word to spell; only characters 'A'..'Z' contribute.
        //   returns - for each accepted letter, OP_WORD followed by that letter's
        //             LetterPhonemes entry (indexed by offset from 'A').
        byte[] SpellOutAcronym(string upper)
        {
            var stream = new List<byte>(upper.Length * 4);

            for (int i = 0; i < upper.Length; i++)
            {
                char letter = upper[i];
                bool isCapital = letter >= 'A' && letter <= 'Z';

                // Anything that is not a capital ASCII letter is silently dropped.
                if (isCapital)
                {
                    stream.Add(OP_WORD);
                    stream.AddRange(LetterPhonemes[letter - 'A']);
                }
            }

            return stream.ToArray();
        }

        // Returns true when `word` has at least two characters and every one of
        // them is an ASCII capital letter ('A'..'Z'); false otherwise.
        static bool IsAllCaps(string word)
        {
            int length = word.Length;
            if (length < 2)
            {
                return false;
            }

            for (int i = 0; i < length; i++)
            {
                char ch = word[i];
                bool isCapital = ch >= 'A' && ch <= 'Z';
                if (!isCapital)
                {
                    return false;
                }
            }

            return true;
        }

        // Converts one upper-cased word into a raw phoneme stream that starts
        // with OP_WORD. Sources are tried in order, first hit wins:
        //   0. NormWords table (keyed on the word exactly as given)
        //   1. _dict.Search            -> counts in StatDict
        //   2. Morph.TryDecompose      -> counts in StatMorph
        //   3. LetterToSound.Convert   -> counts in StatLts
        byte[] WordToPhonStream(string upperWord)
        {
            // Prefixes a phoneme body with the OP_WORD marker.
            static byte[] WithWordMarker(byte[] body)
            {
                var marked = new byte[body.Length + 1];
                marked[0] = OP_WORD;
                body.CopyTo(marked, 1);
                return marked;
            }

            // 0. Normalizer word table: bypasses the dictionary entirely.
            if (NormWords.TryGetValue(upperWord, out var tablePhons))
            {
                return WithWordMarker(tablePhons);
            }

            // Contractions are stored in the dict without apostrophes ("ISN'T" -> "ISNT"),
            // so dictionary and morphology lookups use the stripped form.
            string key = upperWord;
            if (upperWord.Contains('\''))
            {
                key = upperWord.Replace("'", "");
            }

            byte[]? body = _dict.Search(key);
            if (body != null)
            {
                StatDict++;
            }
            else
            {
                // Suffix stripping + root lookup.
                body = Morph.TryDecompose(key, _dict);
                if (body != null)
                {
                    StatMorph++;
                }
                else
                {
                    // Last resort: letter-to-sound rules, fed the word as given
                    // (apostrophes included).
                    body = LetterToSound.Convert(upperWord);
                    StatLts++;
                }
            }

            return WithWordMarker(body);
        }

        // Number -> raw phoneme stream

        // Spells out `n` via BuildNumberPhons and returns the collected bytes
        // as a fresh array.
        byte[] NumberToPhonStream(long n)
        {
            List<byte> spoken = new List<byte>();
            BuildNumberPhons(spoken, n);
            byte[] result = spoken.ToArray();
            return result;
        }

        // Appends the spoken form of `n` to `buf` using symbol-table entries.
        //   buf - destination stream (AppendSymbol inserts OP_WORD while it is empty).
        //   n   - any long value, including long.MinValue.
        // The sign is consumed here and the magnitude is spelled out as an unsigned
        // value. Negating long.MinValue wraps back to long.MinValue in unchecked
        // arithmetic, so recursing on -n would never terminate for that input.
        void BuildNumberPhons(List<byte> buf, long n)
        {
            if (n == 0) { AppendSymbol(buf, "0"); return; }

            ulong magnitude;
            if (n < 0)
            {
                // "minus" via billion slot. TODO: add MINUS to symbols
                AppendSymbol(buf, "1E3");
                // -(n + 1) + 1 yields |n| without ever negating long.MinValue.
                magnitude = (ulong)(-(n + 1)) + 1UL;
            }
            else
            {
                magnitude = (ulong)n;
            }

            BuildMagnitudePhons(buf, magnitude);
        }

        // Spells out a strictly positive magnitude: billions, millions, thousands,
        // hundreds, then tens/teens/units. Group counts are spelled recursively.
        void BuildMagnitudePhons(List<byte> buf, ulong n)
        {
            if (n >= 1_000_000_000)
            {
                BuildMagnitudePhons(buf, n / 1_000_000_000);
                AppendSymbol(buf, "1E3");  // billion
                n %= 1_000_000_000;
            }
            if (n >= 1_000_000)
            {
                BuildMagnitudePhons(buf, n / 1_000_000);
                AppendSymbol(buf, "1E2");  // million
                n %= 1_000_000;
            }
            if (n >= 1_000)
            {
                BuildMagnitudePhons(buf, n / 1_000);
                AppendSymbol(buf, "1E1");  // thousand
                n %= 1_000;
            }
            if (n >= 100)
            {
                AppendDigit(buf, (int)(n / 100));
                AppendSymbol(buf, "100");  // hundred
                n %= 100;
            }
            if (n >= 20)
            {
                AppendTens(buf, (int)(n / 10));
                n %= 10;
                if (n > 0) AppendDigit(buf, (int)n);
            }
            else if (n >= 10)
            {
                AppendTeen(buf, (int)n);
            }
            else if (n > 0)
            {
                AppendDigit(buf, (int)n);
            }
        }

        // Symbol keys passed to AppendSymbol for number words.
        // DigitNames is indexed by the digit 0..9.
        static readonly string[] DigitNames =
        {
            "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        };

        // TeenNames is indexed by (value - 10) for values 10..19.
        static readonly string[] TeenNames =
        {
            "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
        };

        // TensNames is indexed by the tens digit; slots 0 and 1 are empty placeholders.
        static readonly string[] TensNames =
        {
            "", "", "20", "30", "40", "50", "60", "70", "80", "90",
        };

        // Appends the symbol for a single digit d (index into DigitNames).
        void AppendDigit(List<byte> buf, int d)
        {
            AppendSymbol(buf, DigitNames[d]);
        }

        // Appends the symbol for a teen value n (10..19 maps onto TeenNames[0..9]).
        void AppendTeen(List<byte> buf, int n)
        {
            AppendSymbol(buf, TeenNames[n - 10]);
        }

        // Appends the symbol for the tens digit t (index into TensNames).
        void AppendTens(List<byte> buf, int t)
        {
            AppendSymbol(buf, TensNames[t]);
        }

        // Looks `sym` up in the symbol dictionary and appends its phonemes to `buf`.
        // An empty buffer first receives OP_WORD, even if the lookup then finds
        // nothing; a missing symbol contributes no phonemes.
        void AppendSymbol(List<byte> buf, string sym)
        {
            bool isFirstEntry = buf.Count == 0;
            if (isFirstEntry)
            {
                buf.Add(OP_WORD);
            }

            byte[]? entry = _symbols.Search(sym);
            if (entry != null)
            {
                buf.AddRange(entry);
            }
        }

        // Stream -> PhonemeToken list

        // Translates a raw opcode stream into PhonemeTokens appended to `tokens`.
        //   tokens    - output list; new tokens are appended at the end.
        //   stream    - bytes where values <= 55 are phonemes and the OP_* values
        //               are control codes.
        //   isContent - true for content words: they get kContent_Word on the word
        //               start and keep primary stress; otherwise primary stress is
        //               demoted to secondary.
        // Control codes accumulate as flags and are attached to the next phoneme
        // emitted; punctuation codes emit their own kTerm_Bound token.
        void AppendWordTokens(List<PhonemeToken> tokens, byte[] stream, bool isContent)
        {
            int firstNew = tokens.Count;
            long flags = 0;
            bool sawPrimary = false;

            for (int pos = 0; pos < stream.Length; pos++)
            {
                byte code = stream[pos];

                // Real phoneme: emit it carrying whatever flags were collected.
                if (code <= 55)
                {
                    tokens.Add(new PhonemeToken { Phon = (short)code, Ctrl = flags });
                    flags = 0;
                    continue;
                }

                // Control codes (all above 55); unknown codes are ignored.
                switch (code)
                {
                    case OP_WORD:
                        flags |= kWord_Start;
                        if (isContent)
                        {
                            flags |= kContent_Word;
                        }
                        break;

                    case OP_STRESS1:
                        if (!isContent)
                        {
                            // Function words: demote dict primary stress to secondary so
                            // they don't trigger pitch peaks in the BackEnd pitch algorithm.
                            flags |= kSecondaryStress;
                        }
                        else
                        {
                            flags |= kPrimaryStress;
                            sawPrimary = true;
                        }
                        break;

                    case OP_STRESS2:
                        flags |= kSecondaryStress;
                        break;

                    case OP_EMPHSTRESS:
                        flags |= kEmphaticStress;
                        break;

                    case OP_SYLL:
                        flags |= kSyllable_Start;
                        break;

                    case OP_PREP:
                        flags |= kPrep_Start;
                        break;

                    case OP_VERB:
                        flags |= kVerb_Start;
                        break;

                    case OP_COMMA:
                    case OP_PERIOD:
                    case OP_QUEST:
                    case OP_EXCLAM:
                        // Punctuation becomes its own token; collected flags are dropped.
                        tokens.Add(new PhonemeToken { Phon = (short)code, Ctrl = kTerm_Bound });
                        flags = 0;
                        break;
                }
            }

            // Only content words that never saw a primary stress need the fix-up below.
            if (!isContent || sawPrimary)
            {
                return;
            }

            // Promote the first secondary-stressed token added by this call to primary,
            // so the pitch algorithm has a peak to work with on words like "how".
            for (int idx = firstNew; idx < tokens.Count; idx++)
            {
                var current = tokens[idx];
                if ((current.Ctrl & kSecondaryStress) == 0)
                {
                    continue;
                }

                tokens[idx] = new PhonemeToken
                {
                    Phon = current.Phon,
                    Ctrl = (current.Ctrl & ~kSecondaryStress) | kPrimaryStress,
                    UserPitch = current.UserPitch,
                    UserDur = current.UserDur,
                    UserNote = current.UserNote,
                    UserRate = current.UserRate,
                };
                return;
            }
        }

    }
}  // namespace