s&box

Phonemizer.cs
#nullable enable
using System;
using System.Collections.Generic;
#if !SANDBOX
#endif
using System.Text.RegularExpressions;
using static SharpTalk.AudioProcessor;
using static SharpTalk.Phonemizer.Normalizer;

namespace SharpTalk
{

    public class Phonemizer
    {
        // Pronunciation dictionary: searched by upper-cased word in WordToPhonStream
        // and passed to Morph.TryDecompose for root lookups.
        readonly DictReader _dict;
        // Symbol dictionary: searched by AppendSymbol with number-name keys such as
        // "0", "13", "90", "HUNDRED", "THOUSAND", "MILLION" and "MINUS".
        readonly DictReader _symbols;

        // Opcodes that are control codes, not actual phonemes (56-72).
        // AppendWordTokens emits bytes <= 55 as phoneme tokens, accumulates the
        // stress/boundary opcodes below into the Ctrl flags of the next phoneme token,
        // emits the four punctuation opcodes as kTerm_Bound tokens, and ignores any
        // other byte value.
        const byte OP_STRESS1 = 56;  // _Stress1_  → kPrimaryStress (kSecondaryStress for function words)
        const byte OP_STRESS2 = 57;  // _Stress2_  → kSecondaryStress
        const byte OP_EMPHSTRESS = 58;  // _EmphStress_ → kEmphaticStress
        const byte OP_SYLL = 63;  // _Syll_     → kSyllable_Start
        const byte OP_WORD = 64;  // _Word_     → kWord_Start (plus kContent_Word for content words)
        const byte OP_PREP = 65;  // _Prep_     → kPrep_Start
        const byte OP_VERB = 66;  // _Verb_     → kVerb_Start
        const byte OP_COMMA = 67;  // _Comma_    → emitted as its own kTerm_Bound token
        const byte OP_PERIOD = 68;  // _Period_   → emitted as its own kTerm_Bound token
        const byte OP_QUEST = 69;  // _Quest_    → emitted as its own kTerm_Bound token
        const byte OP_EXCLAM = 70;  // _Exclam_   → emitted as its own kTerm_Bound token

        // Function words — do NOT receive kContent_Word; primary dict stress is
        // suppressed so they don't drive pitch peaks in the BackEnd pitch algorithm.
        // Mirrors POS-based content/function distinction.
        // Lookup is case-insensitive (OrdinalIgnoreCase), so callers may pass the
        // word exactly as it appeared in the text. Each word is listed once.
        static readonly HashSet<string> FuncWords = new(StringComparer.OrdinalIgnoreCase)
        {
            // articles / determiners
            "a", "an", "the",
            // prepositions
            "of", "in", "on", "at", "by", "for", "to", "up", "as", "into",
            "from", "with", "about", "over", "under", "out", "off", "than",
            // coordinating conjunctions
            "and", "or", "but", "nor", "yet", "so",
            // subordinating conjunctions ("than" is already listed with the prepositions)
            "if", "that", "when", "while", "because", "though",
            "although", "unless", "until", "since", "after", "before",
            // auxiliaries & copula
            "be", "am", "is", "are", "was", "were", "been", "being",
            "have", "has", "had", "do", "does", "did",
            "will", "would", "could", "should", "may", "might", "shall",
            "can", "must", "ought",
            // subject / object pronouns
            "i", "he", "she", "we", "they", "you", "it",
            "me", "him", "her", "us", "them",
            // possessive determiners
            "my", "your", "his", "its", "our", "their",
            // other function words
            "not", "no", "there", "here",
        };

        // Tokenizer for normalized text. Exactly one capture group succeeds per match:
        //   1 = run of digits
        //   2 = word made of ASCII letters, optionally with internal apostrophes ("don't")
        //   3 = clause punctuation  , ; :
        //   4 = sentence punctuation  . ! ?
        //   5 = whitespace
        // Characters that fit none of the alternatives are never matched, so loops
        // over TokenRe.Matches simply never see them.
        static readonly Regex TokenRe = new(
            @"(\d+)|([a-zA-Z]+(?:'[a-zA-Z]+)*)|([,;:])|([.!?])|(\s+)",
            RegexOptions.Compiled);

        // Zero-width match at CamelCase/PascalCase word boundaries: between a lowercase
        // letter and an uppercase letter ("Sharp|Talk"), and between an uppercase letter
        // and an uppercase letter that is followed by a lowercase one ("HTTP|Server").
        // Normalizer.Normalize replaces each such position with a space.
        static readonly Regex CamelSplit = new(
            @"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])",
            RegexOptions.Compiled);

#if !SANDBOX
#endif

        // Creates a phonemizer over two dictionary images: dictData backs the word
        // dictionary (_dict), symbolsData backs the number/symbol dictionary (_symbols).
        public Phonemizer(byte[] dictData, byte[] symbolsData)
        {
            (_dict, _symbols) = (new DictReader(dictData), new DictReader(symbolsData));
        }

#if !SANDBOX
#endif

        // Punctuation class of the most recently processed text. Starts as _Period_,
        // is reset to _Period_ at the start of TextToPhonemes and of each clause in
        // TextToSentenceTokens, becomes _Comma_ after , ; or :, and becomes
        // _Quest_ / _Exclam_ / _Period_ after ? ! or . respectively.
        public short LastEndPunct { get; private set; } = _Period_;

        // Splits text into clauses and converts each clause to phoneme tokens.
        // Returns one (tokens, end punctuation) pair per clause, in input order:
        //   - embedded command segments are skipped (handled by TtsEngine, not here);
        //   - every non-empty singing block becomes its own clause ending in _Period_,
        //     so singing is never mixed with speech;
        //   - plain text is normalized, then cut after every , ; : . ! ? so that each
        //     clause can be rendered separately and pitch resets cleanly.
        // If nothing was produced, the whole text is run through TextToPhonemes and
        // returned as a single clause, so the result always has at least one entry.
        public (PhonemeToken[] Tokens, short EndPunct)[] TextToSentenceTokens(string text)
        {
            var clauses = new List<(PhonemeToken[], short)>();

            foreach (var segment in EmbeddedCmd.ParseSegments(text))
            {
                if (segment.IsCommand)
                    continue;

                if (segment.IsSinging)
                {
                    var notes = segment.Singing!;
                    if (notes.Count > 0)
                        clauses.Add((notes.ToArray(), _Period_));
                    continue;
                }

                string normalized = Normalize(segment.PlainText!);
                int clauseStart = 0;

                foreach (Match match in TokenRe.Matches(normalized))
                {
                    // Only clause (group 3) and sentence (group 4) punctuation end a clause.
                    bool endsClause = match.Groups[3].Success || match.Groups[4].Success;
                    if (!endsClause)
                        continue;

                    // The clause keeps its terminating punctuation so that
                    // TextSegmentToPhonemes can set LastEndPunct from it.
                    int clauseEnd = match.Index + match.Length;
                    PhonemeToken[] clauseTokens = TextSegmentToPhonemes(normalized[clauseStart..clauseEnd]);
                    clauses.Add((clauseTokens, LastEndPunct));
                    clauseStart = clauseEnd;
                }

                // Trailing text without closing punctuation; whitespace-only tails are dropped.
                if (clauseStart >= normalized.Length)
                    continue;

                string tail = normalized[clauseStart..];
                if (tail.Trim().Length == 0)
                    continue;

                PhonemeToken[] tailTokens = TextSegmentToPhonemes(tail);
                clauses.Add((tailTokens, LastEndPunct));
            }

            if (clauses.Count == 0)
            {
                PhonemeToken[] everything = TextToPhonemes(text);
                clauses.Add((everything, LastEndPunct));
            }

            return clauses.ToArray();
        }

        // Process a pure-text span (no embedded commands) into phoneme tokens.
        //
        // Precondition: text is already normalized. The only caller,
        // TextToSentenceTokens, cuts its clauses out of Normalize() output, so this
        // method no longer normalizes again: the second pass repeated every regex
        // replacement per clause and could re-interpret text exposed by the first
        // pass (e.g. "1-st" -> "1 st" -> "first"), which TextToPhonemes' single
        // pass never does.
        //
        // Side effect: LastEndPunct is reset to _Period_ and then follows the last
        // punctuation mark seen in the span.
        private PhonemeToken[] TextSegmentToPhonemes(string text)
        {
            var tokens = new List<PhonemeToken>();
            LastEndPunct = _Period_;

            foreach (Match m in TokenRe.Matches(text))
            {
                if (m.Groups[1].Success)           // number
                {
                    // Digit runs that do not fit in a long are skipped.
                    if (long.TryParse(m.Groups[1].Value, out long n))
                        AppendWordTokens(tokens, NumberToPhonStream(n), isContent: true);
                }
                else if (m.Groups[2].Success)      // word
                {
                    string word = m.Groups[2].Value;
                    bool isContent = !FuncWords.Contains(word);
                    AppendWordTokens(tokens, WordToPhonStream(word.ToUpperInvariant()), isContent);
                }
                else if (m.Groups[3].Success)      // , ; :
                {
                    // Clause punctuation becomes an explicit pause token.
                    tokens.Add(new PhonemeToken
                    {
                        Phon = _SIL_,
                        Ctrl = kTerm_Bound | ((long)kBND_Pause << kSilenceTypeShift),
                    });
                    LastEndPunct = _Comma_;
                }
                else if (m.Groups[4].Success)      // . ! ?
                {
                    // Sentence punctuation emits no token; it only records the ending type.
                    char p = m.Groups[4].Value[0];
                    LastEndPunct = p == '?' ? _Quest_ : p == '!' ? _Exclam_ : _Period_;
                }
                // whitespace: skip
            }

            return tokens.ToArray();
        }

        // Converts the whole text into one flat phoneme-token list.
        // Embedded command segments are skipped (handled by TtsEngine), singing
        // blocks are copied through unchanged, and plain-text spans are normalized
        // and tokenized. LastEndPunct is reset to _Period_ on entry and afterwards
        // reflects the last punctuation mark encountered.
        public PhonemeToken[] TextToPhonemes(string text)
        {
            var output = new List<PhonemeToken>();
            LastEndPunct = _Period_;

            // Ordered segments: plain text spans interleaved with singing blocks.
            foreach (var segment in EmbeddedCmd.ParseSegments(text))
            {
                if (segment.IsCommand)
                    continue;

                if (segment.IsSinging)
                {
                    output.AddRange(segment.Singing!);
                    continue;
                }

                string normalized = Normalize(segment.PlainText!);
                foreach (Match match in TokenRe.Matches(normalized))
                {
                    GroupCollection g = match.Groups;

                    if (g[1].Success)
                    {
                        // Number; digit runs that do not fit in a long are skipped.
                        if (long.TryParse(g[1].Value, out long value))
                            AppendWordTokens(output, NumberToPhonStream(value), isContent: true);
                    }
                    else if (g[2].Success)
                    {
                        // Word; function words are flagged as non-content.
                        string word = g[2].Value;
                        AppendWordTokens(
                            output,
                            WordToPhonStream(word.ToUpperInvariant()),
                            !FuncWords.Contains(word));
                    }
                    else if (g[3].Success)
                    {
                        // , ; : — explicit pause token.
                        var pause = new PhonemeToken
                        {
                            Phon = _SIL_,
                            Ctrl = kTerm_Bound | ((long)kBND_Pause << kSilenceTypeShift),
                        };
                        output.Add(pause);
                        LastEndPunct = _Comma_;
                    }
                    else if (g[4].Success)
                    {
                        // . ! ? — no token, only the ending type is recorded.
                        LastEndPunct = g[4].Value[0] switch
                        {
                            '?' => _Quest_,
                            '!' => _Exclam_,
                            _ => _Period_,
                        };
                    }
                    // Remaining case is whitespace: nothing to emit.
                }
            }

            return output.ToArray();
        }

        // Text normalization
        // Nested static class keeps normalizer state (regexes, tables) out of the
        // Phonemizer field list without a separate file.
        //
        // Normalize never throws on odd input: a replacement that cannot be computed
        // (number too large, unexpected abbreviation spelling) leaves the matched
        // text exactly as written.
        internal static class Normalizer
        {
            // Patterns whose digit captures are parsed or used as table indices spell
            // the class as [0-9]. In .NET, \d also matches non-ASCII decimal digits,
            // which long.Parse rejects and which index outside DigitWords.
            static readonly Regex ReCurrency = new(
                @"\$\s*([0-9]+)(?:\.([0-9]{1,2}))?", RegexOptions.Compiled);
            // The captured digits are only copied through, so \d is harmless here.
            static readonly Regex RePercent = new(
                @"(\d+)\s*%", RegexOptions.Compiled);
            static readonly Regex ReOrdinal = new(
                @"\b([0-9]+)\s*(?:st|nd|rd|th)\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);
            static readonly Regex ReDecimal = new(
                @"\b([0-9]+)\.([0-9]+)\b", RegexOptions.Compiled);
            // Every alternative listed here must have an entry in AbbrevMap.
            static readonly Regex ReAbbrev = new(
                @"\b(Dr|Mr|Mrs|Ms|Prof|Jr|Sr|Vs|Etc|St|Ave|Blvd|Rd|Ln"
              + @"|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec"
              + @"|Lt|Cpt|Capt|Gen|Sgt|Pvt|Col|Maj|Rev|Dept|Inc|Corp|Approx)\.",
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            // Abbreviation (without the trailing period) -> spoken expansion.
            static readonly Dictionary<string, string> AbbrevMap =
                new(StringComparer.OrdinalIgnoreCase)
                {
                    ["Dr"] = "Doctor",
                    ["Mr"] = "Mister",
                    ["Mrs"] = "Missus",
                    ["Ms"] = "Miss",
                    ["Prof"] = "Professor",
                    ["Jr"] = "Junior",
                    ["Sr"] = "Senior",
                    ["Vs"] = "versus",
                    ["Etc"] = "etcetera",
                    ["St"] = "Saint",
                    ["Ave"] = "Avenue",
                    ["Blvd"] = "Boulevard",
                    ["Rd"] = "Road",
                    ["Ln"] = "Lane",
                    ["Lt"] = "Lieutenant",
                    ["Cpt"] = "Captain",
                    ["Capt"] = "Captain",
                    ["Gen"] = "General",
                    ["Sgt"] = "Sergeant",
                    ["Pvt"] = "Private",
                    ["Col"] = "Colonel",
                    ["Maj"] = "Major",
                    ["Rev"] = "Reverend",
                    ["Dept"] = "Department",
                    ["Inc"] = "Incorporated",
                    ["Corp"] = "Corporation",
                    ["Approx"] = "approximately",
                    ["Jan"] = "January",
                    ["Feb"] = "February",
                    ["Mar"] = "March",
                    ["Apr"] = "April",
                    ["Jun"] = "June",
                    ["Jul"] = "July",
                    ["Aug"] = "August",
                    ["Sep"] = "September",
                    ["Sept"] = "September",
                    ["Oct"] = "October",
                    ["Nov"] = "November",
                    ["Dec"] = "December",
                };

            // Spoken form of a single decimal digit, indexed by its value.
            static readonly string[] DigitWords =
                new string[] { "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine" };

            // Ordinals 0-19, indexed by value.
            static readonly string[] OnesOrd = new string[]
            {
                "zeroth","first","second","third","fourth","fifth","sixth","seventh",
                "eighth","ninth","tenth","eleventh","twelfth","thirteenth","fourteenth",
                "fifteenth","sixteenth","seventeenth","eighteenth","nineteenth",
            };
            // Round-ten ordinals, indexed by the tens digit (entries 0 and 1 unused).
            static readonly string[] TensOrd = new string[]
            {
                "","","twentieth","thirtieth","fortieth","fiftieth",
                "sixtieth","seventieth","eightieth","ninetieth",
            };
            // Cardinal tens used as the first word of compound ordinals ("twenty first").
            static readonly string[] TensWords = new string[]
            {
                "","","twenty","thirty","forty","fifty","sixty","seventy","eighty","ninety",
            };

            // Returns the ordinal word(s) for 0-99. Negative values and values of 100
            // or more are returned as plain digits, i.e. they are later read as
            // cardinals.
            static string OrdinalToWord(long n)
            {
                if (n < 0) return n.ToString();
                if (n < 20) return OnesOrd[n];
                if (n < 100)
                {
                    int t = (int)(n / 10), o = (int)(n % 10);
                    return o == 0 ? TensOrd[t] : TensWords[t] + " " + OnesOrd[o];
                }
                return n.ToString(); // cardinal fallback for 100+ (rare as ordinal)
            }

            // Rewrites written conventions into plain words and digit runs that the
            // tokenizer can read: CamelCase splitting, currency, percentages,
            // ordinals, decimals, abbreviations and hyphens, in that order.
            public static string Normalize(string text)
            {
                // 0. Split CamelCase/PascalCase so "SharpTalk" → "Sharp Talk"
                text = CamelSplit.Replace(text, " ");

                // 1. Currency — before decimal so $3.99 isn't split at the dot
                text = ReCurrency.Replace(text, m =>
                {
                    // An amount too large for long is left as written instead of throwing.
                    if (!long.TryParse(m.Groups[1].Value, out long dollars))
                        return m.Value;

                    string r = dollars + " dollar" + (dollars == 1 ? "" : "s");
                    if (m.Groups[2].Success)
                    {
                        // "$3.5" means 50 cents, so pad a single digit on the right.
                        string cs = m.Groups[2].Value.PadRight(2, '0')[..2];
                        if (long.TryParse(cs, out long cents) && cents > 0)
                            r += " and " + cents + " cent" + (cents == 1 ? "" : "s");
                    }
                    return r;
                });

                // 2. Percentages
                text = RePercent.Replace(text, m => m.Groups[1].Value + " percent");

                // 3. Ordinals — before decimals to avoid "1.5th" oddities.
                //    Digit runs too large for long are left as written.
                text = ReOrdinal.Replace(text, m =>
                    long.TryParse(m.Groups[1].Value, out long n) ? OrdinalToWord(n) : m.Value);

                // 4. Decimal numbers — spell each digit after the point individually
                text = ReDecimal.Replace(text, m =>
                {
                    string r = m.Groups[1].Value + " point";
                    foreach (char c in m.Groups[2].Value)
                        r += " " + DigitWords[c - '0']; // c is '0'..'9' by construction of ReDecimal
                    return r;
                });

                // 5. Abbreviations — expand so their period doesn't trigger sentence split.
                //    TryGetValue guards against the regex's case-insensitive matching
                //    accepting a spelling that the ordinal-ignore-case map does not.
                text = ReAbbrev.Replace(text, m =>
                    AbbrevMap.TryGetValue(m.Groups[1].Value, out var expansion) ? expansion : m.Value);

                // 6. Hyphens → space (compound words read naturally)
                text = text.Replace('-', ' ');

                return text;
            }
        }

        // Word → raw phoneme stream

        // Returns the raw phoneme stream for an upper-cased word, prefixed with
        // OP_WORD. Sources are tried in order and the first hit wins: direct
        // dictionary lookup, morphological decomposition (suffix stripping + root
        // lookup), then letter-to-sound rules.
        byte[] WordToPhonStream(string upperWord)
        {
            byte[] phons = _dict.Search(upperWord)
                ?? Morph.TryDecompose(upperWord, _dict)
                ?? EngToP.Convert(upperWord);

            // Slot 0 carries the word-start marker; the phonemes follow.
            var stream = new byte[1 + phons.Length];
            stream[0] = OP_WORD;
            Array.Copy(phons, 0, stream, 1, phons.Length);
            return stream;
        }

        // Number → raw phoneme stream

        // Returns the raw phoneme stream for the cardinal number n as built by
        // BuildNumberPhons.
        byte[] NumberToPhonStream(long n)
        {
            List<byte> phons = new();
            BuildNumberPhons(phons, n);
            return phons.ToArray();
        }

        // Appends the symbol-dictionary phonemes for the cardinal number n to buf.
        // Negative values are spoken as "MINUS" followed by the magnitude.
        void BuildNumberPhons(List<byte> buf, long n)
        {
            if (n < 0) AppendSymbol(buf, "MINUS");

            // Take the magnitude in unsigned space. -long.MinValue wraps back to
            // long.MinValue, so negating and recursing would never terminate for
            // that value.
            ulong magnitude = n < 0 ? (ulong)(-(n + 1)) + 1UL : (ulong)n;
            BuildMagnitudePhons(buf, magnitude);
        }

        // Appends the phonemes for a non-negative magnitude: millions, thousands,
        // hundreds, then tens/teens/units. Groups above a million recurse, so very
        // large values come out as "... million million".
        void BuildMagnitudePhons(List<byte> buf, ulong n)
        {
            if (n == 0) { AppendSymbol(buf, "0"); return; }

            if (n >= 1_000_000)
            {
                BuildMagnitudePhons(buf, n / 1_000_000);
                AppendSymbol(buf, "MILLION");
                n %= 1_000_000;
            }
            if (n >= 1_000)
            {
                BuildMagnitudePhons(buf, n / 1_000);
                AppendSymbol(buf, "THOUSAND");
                n %= 1_000;
            }
            if (n >= 100)
            {
                AppendDigit(buf, (int)(n / 100));
                AppendSymbol(buf, "HUNDRED");
                n %= 100;
            }
            if (n >= 20)
            {
                AppendTens(buf, (int)(n / 10));
                n %= 10;
                if (n > 0) AppendDigit(buf, (int)n);
            }
            else if (n >= 10)
            {
                AppendTeen(buf, (int)n);
            }
            else if (n > 0)
            {
                AppendDigit(buf, (int)n);
            }
        }

        // Symbol-dictionary keys for number words. DigitNames is indexed by the digit,
        // TeenNames by (value - 10), TensNames by the tens digit (entries 0 and 1 unused).
        static readonly string[] DigitNames = { "0", "1", "2", "3", "4", "5", "6", "7", "8", "9" };
        static readonly string[] TeenNames = { "10", "11", "12", "13", "14", "15", "16", "17", "18", "19" };
        static readonly string[] TensNames = { "", "", "20", "30", "40", "50", "60", "70", "80", "90" };

        // Appends the phonemes for a single digit d (0-9).
        void AppendDigit(List<byte> buf, int d)
        {
            AppendSymbol(buf, DigitNames[d]);
        }

        // Appends the phonemes for a teen value n (10-19).
        void AppendTeen(List<byte> buf, int n)
        {
            AppendSymbol(buf, TeenNames[n - 10]);
        }

        // Appends the phonemes for a round ten, given its tens digit t (2-9).
        void AppendTens(List<byte> buf, int t)
        {
            AppendSymbol(buf, TensNames[t]);
        }

        // Appends the symbol dictionary's phonemes for sym to buf. An empty buffer
        // first receives the OP_WORD marker, so a whole number forms a single word.
        // Symbols missing from the dictionary contribute nothing.
        void AppendSymbol(List<byte> buf, string sym)
        {
            if (buf.Count == 0)
                buf.Add(OP_WORD);

            if (_symbols.Search(sym) is byte[] phons)
                buf.AddRange(phons);
        }

        // Stream → PhonemeToken list

        // Decodes one raw phoneme stream and appends the resulting tokens.
        // Control opcodes accumulate into a pending flag set that is attached to the
        // next phoneme (byte <= 55) and then cleared. Punctuation opcodes are emitted
        // as kTerm_Bound tokens of their own and also clear the pending flags.
        // isContent selects content-word treatment: kContent_Word on the word start
        // and primary stress kept as primary.
        void AppendWordTokens(List<PhonemeToken> tokens, byte[] stream, bool isContent)
        {
            int firstNew = tokens.Count;
            long ctrl = 0;
            bool sawPrimary = false;

            for (int i = 0; i < stream.Length; i++)
            {
                byte op = stream[i];

                if (op <= 55)
                {
                    // Real phoneme: carries whatever flags were queued before it.
                    tokens.Add(new PhonemeToken { Phon = (short)op, Ctrl = ctrl });
                    ctrl = 0;
                }
                else if (op == OP_WORD)
                {
                    ctrl |= kWord_Start;
                    if (isContent)
                        ctrl |= kContent_Word;
                }
                else if (op == OP_STRESS1)
                {
                    // Function words: demote dict primary stress to secondary so they
                    // don't trigger pitch peaks in the BackEnd pitch algorithm.
                    if (isContent)
                    {
                        ctrl |= kPrimaryStress;
                        sawPrimary = true;
                    }
                    else
                    {
                        ctrl |= kSecondaryStress;
                    }
                }
                else if (op == OP_STRESS2)
                {
                    ctrl |= kSecondaryStress;
                }
                else if (op == OP_EMPHSTRESS)
                {
                    ctrl |= kEmphaticStress;
                }
                else if (op == OP_SYLL)
                {
                    ctrl |= kSyllable_Start;
                }
                else if (op == OP_PREP)
                {
                    ctrl |= kPrep_Start;
                }
                else if (op == OP_VERB)
                {
                    ctrl |= kVerb_Start;
                }
                else if (op is OP_COMMA or OP_PERIOD or OP_QUEST or OP_EXCLAM)
                {
                    tokens.Add(new PhonemeToken { Phon = (short)op, Ctrl = kTerm_Bound });
                    ctrl = 0;
                }
                // Any other byte value is ignored.
            }

            if (!isContent || sawPrimary)
                return;

            // Content word with only secondary stress: promote the first such token
            // to primary so the pitch algorithm has a peak to work with on words
            // like "how".
            int promote = tokens.FindIndex(firstNew, t => (t.Ctrl & kSecondaryStress) != 0);
            if (promote < 0)
                return;

            var old = tokens[promote];
            tokens[promote] = new PhonemeToken
            {
                Phon = old.Phon,
                Ctrl = (old.Ctrl & ~kSecondaryStress) | kPrimaryStress,
                UserPitch = old.UserPitch,
                UserDur = old.UserDur,
                UserNote = old.UserNote,
                UserRate = old.UserRate,
            };
        }

    }
}  // namespace