// GraphemeToPhoneme.cs
#nullable enable
namespace SharpTalk
{

    /// <summary>
    /// Rule-driven English grapheme-to-phoneme converter.
    /// </summary>
    /// <remarks>
    /// Rules are read from <c>Tables.EngToPRules</c>; the first rule to try for a letter is
    /// looked up in <c>Tables.EngToPHash</c>. As consumed by this class, one rule is laid out as:
    /// <code>
    /// [offset to next rule] [middle pattern] FF [left context] FF [right context] FF [phonemes + 1] FF
    /// </code>
    /// The middle pattern lists the letters that must follow the current letter. The left
    /// context is matched while stepping leftwards from the letter before the current one,
    /// the right context while stepping rightwards from the first letter after the middle
    /// pattern. Context bytes whose <c>Tables.EngToPKind</c> entry equals <c>SPCHAR</c> are
    /// pattern operators (see <c>DoRule</c>); all other bytes must match literally.
    /// </remarks>
    public static class EngToP
    {
        // Terminates each section of a rule in Tables.EngToPRules.
        const byte DELIMITER = 0xFF;

        // Character-class flags tested against Tables.EngToPKind entries.
        const byte CONSON = 0x01; // consonant
        const byte HISCON = 0x02; // sibilant consonant (tested by FindSibilant)
        const byte VOICON = 0x08; // voiced consonant
        const byte VOWEL = 0x10;  // vowel
        const byte FRONT = 0x20;  // front vowel
        const byte SPCHAR = 0x80; // rule byte is a pattern operator, not a literal

        // Stored in the working buffer for characters that cannot be represented as one byte.
        // It is not a letter, so the main loop skips it and no literal rule byte is expected
        // to match it.
        const byte PLACEHOLDER = (byte)'?';

        /// <summary>
        /// Converts a single word to phoneme codes.
        /// </summary>
        /// <param name="word">
        /// The word to convert; it is upper-cased internally. Conversion stops at the first
        /// space, so text after an embedded space is ignored.
        /// </param>
        /// <returns>
        /// The phoneme codes emitted by the matching rules, in order. An empty array is
        /// returned for a null or empty <paramref name="word"/>.
        /// </returns>
        public static byte[] Convert(string word)
        {
            if (string.IsNullOrEmpty(word)) return System.Array.Empty<byte>();

            // Working buffer: ' ' + WORD + ' ' + ' '.
            // The leading space and the two trailing spaces give the context matchers
            // room to look one step before and a little past the word.
            int len = word.Length;
            byte[] inp = new byte[len + 3];
            inp[0] = (byte)' ';
            for (int i = 0; i < len; i++)
                inp[i + 1] = ToInputByte(word[i]);
            inp[len + 1] = (byte)' ';
            inp[len + 2] = (byte)' ';

            var phons = new System.Collections.Generic.List<byte>(len * 3);
            int iPos = 1; // first character after the leading space

            // The bounds test covers the case where a middle pattern consumed the buffer
            // right up to its end.
            while (iPos < inp.Length && inp[iPos] != ' ')
            {
                // Anything that is not A-Z (apostrophes, periods, digits, placeholders, ...)
                // produces no phonemes of its own and is stepped over.
                int letterIdx = inp[iPos] - 'A';
                if (letterIdx < 0 || letterIdx >= 26)
                {
                    iPos++;
                    continue;
                }

                // Walk the rule chain for this letter until one rule matches.
                int ruleIdx = Tables.EngToPHash[letterIdx];
                while (true)
                {
                    if (TryMatchRule(inp, iPos, ruleIdx, out int nextIPos, out int oPos))
                    {
                        // Output bytes are stored with a +1 bias (so that phoneme 0 can be
                        // represented); remove the bias while emitting.
                        for (; Tables.EngToPRules[oPos] != DELIMITER; oPos++)
                            phons.Add((byte)(Tables.EngToPRules[oPos] - 1));

                        iPos = nextIPos;
                        break;
                    }

                    // The first byte of a rule is the distance to the next rule.
                    ruleIdx += Tables.EngToPRules[ruleIdx];
                }
            }

            return phons.ToArray();
        }

        // Upper-cases one input character and narrows it to a buffer byte.
        // A plain (byte) cast would wrap code units above 0xFF into unrelated values
        // (for example U+0120 would become a space and end the word early), so anything that
        // does not fit, or that would collide with DELIMITER, becomes PLACEHOLDER instead.
        static byte ToInputByte(char ch)
        {
            char upper = char.ToUpperInvariant(ch);
            return upper < DELIMITER ? (byte)upper : PLACEHOLDER;
        }

        // Tries the rule starting at ruleIdx against the letter at inp[iPos].
        // On success nextIPos is the first input position not consumed by the rule's middle
        // pattern and outputPos is the index of the rule's first output byte.
        static bool TryMatchRule(byte[] inp, int iPos, int ruleIdx, out int nextIPos, out int outputPos)
        {
            nextIPos = iPos;
            outputPos = -1;

            // Middle pattern: the rule bytes up to the first DELIMITER must equal the input
            // characters that follow the current letter.
            int rPos = ruleIdx + 1; // skip the offset byte
            int sPos = iPos + 1;
            while (Tables.EngToPRules[rPos] != DELIMITER)
            {
                if (sPos >= inp.Length || inp[sPos] != Tables.EngToPRules[rPos]) return false;
                sPos++;
                rPos++;
            }
            rPos++; // step over the DELIMITER that closes the middle pattern

            // Left context: walk leftwards, starting just before the current letter.
            int leftPos = iPos - 1;
            int afterLeft = DoRule(inp, ref leftPos, rPos, -1);
            if (afterLeft < 0) return false;

            // Right context: walk rightwards, starting after the middle pattern.
            int rightPos = sPos;
            int afterRight = DoRule(inp, ref rightPos, afterLeft, 1);
            if (afterRight < 0) return false;

            nextIPos = sPos;
            outputPos = afterRight;
            return true;
        }

        // Matches one context section of a rule (the bytes from rPos up to the next DELIMITER)
        // against the input, moving iPos by `direction` (+1 or -1) as characters are consumed.
        // Returns the index just past the section's DELIMITER on success, or -1 on failure.
        // An empty section always matches. On failure iPos is left wherever matching stopped,
        // so callers pass a scratch copy.
        static int DoRule(byte[] inp, ref int iPos, int rPos, int direction)
        {
            while (Tables.EngToPRules[rPos] != DELIMITER)
            {
                byte rc = Tables.EngToPRules[rPos];

                if (Tables.EngToPKind[rc] != SPCHAR)
                {
                    // Literal rule byte: the input character must be identical.
                    if (!InBounds(inp, iPos) || inp[iPos] != rc) return -1;
                    iPos += direction;
                    rPos++;
                    continue;
                }

                // Pattern operator. For the repeating operators the remainder of the section
                // is tried after each consumed character; if none of those attempts succeeds,
                // matching continues below with the whole run consumed.
                int resume;
                switch ((char)rc)
                {
                    case '*': // one or more consonants
                        if (!FindConsonant(inp, ref iPos, direction)) return -1;
                        resume = MatchRestWithinRun(inp, ref iPos, rPos + 1, direction, CONSON);
                        if (resume >= 0) return resume;
                        break;

                    case '$': // exactly one vowel
                        if (!FindVowel(inp, ref iPos, direction)) return -1;
                        break;

                    case '^': // exactly one consonant
                        if (!FindConsonant(inp, ref iPos, direction)) return -1;
                        break;

                    case ':': // zero or more consonants
                        FindConsonant(inp, ref iPos, direction); // optional, result not needed
                        resume = MatchRestWithinRun(inp, ref iPos, rPos + 1, direction, CONSON);
                        if (resume >= 0) return resume;
                        break;

                    case '+': // front vowel
                        if (!HasKind(inp, iPos, FRONT)) return -1;
                        iPos += direction;
                        break;

                    case 'v': // zero or more vowels
                        FindVowel(inp, ref iPos, direction); // optional, result not needed
                        resume = MatchRestWithinRun(inp, ref iPos, rPos + 1, direction, VOWEL);
                        if (resume >= 0) return resume;
                        break;

                    case '#': // one or more vowels
                        if (!FindVowel(inp, ref iPos, direction)) return -1;
                        resume = MatchRestWithinRun(inp, ref iPos, rPos + 1, direction, VOWEL);
                        if (resume >= 0) return resume;
                        break;

                    case '.': // voiced consonant
                        if (!HasKind(inp, iPos, VOICON)) return -1;
                        iPos += direction;
                        break;

                    case '&': // sibilant consonant
                        if (!FindSibilant(inp, ref iPos, direction)) return -1;
                        break;

                    // Operators backed by a list of alternative letter sequences.
                    case 'l': if (!SearchSpecial(inp, ref iPos, Tables.EngToPL, direction)) return -1; break;
                    case '-': if (!SearchSpecial(inp, ref iPos, Tables.EngToPDash, direction)) return -1; break;
                    case '%': if (!SearchSpecial(inp, ref iPos, Tables.EngToPPercent, direction)) return -1; break;
                    case 'z': if (!SearchSpecial(inp, ref iPos, Tables.EngToPZ, direction)) return -1; break;
                    case 'b': if (!SearchSpecial(inp, ref iPos, Tables.EngToPB, direction)) return -1; break;
                    case '@': if (!SearchSpecial(inp, ref iPos, Tables.EngToPAt, direction)) return -1; break;
                    case 'm': if (!SearchSpecial(inp, ref iPos, Tables.EngToPM, direction)) return -1; break;

                    // Any other byte flagged SPCHAR consumes no input and is simply skipped.
                }
                rPos++;
            }
            return rPos + 1; // index just past the closing DELIMITER
        }

        // Shared backtracking step of the repeating operators ('*', ':', 'v', '#').
        // While the character at iPos belongs to the class selected by kindMask, tries to
        // match the remainder of the rule section (starting at restPos) from that position;
        // if that fails, consumes the character and tries again.
        // Returns the result of the first successful attempt (with iPos updated to where that
        // attempt ended), or -1 once the run of matching characters is exhausted, in which
        // case iPos is the first position outside the run.
        static int MatchRestWithinRun(byte[] inp, ref int iPos, int restPos, int direction, byte kindMask)
        {
            while (HasKind(inp, iPos, kindMask))
            {
                int probe = iPos;
                int end = DoRule(inp, ref probe, restPos, direction);
                if (end >= 0)
                {
                    iPos = probe;
                    return end;
                }
                iPos += direction;
            }
            return -1;
        }

        // True when pos is a valid index into inp.
        static bool InBounds(byte[] inp, int pos)
        {
            return pos >= 0 && pos < inp.Length;
        }

        // True when pos is inside the buffer and the character there carries any of the
        // Tables.EngToPKind flags in mask.
        static bool HasKind(byte[] inp, int pos, byte mask)
        {
            return InBounds(inp, pos) && (Tables.EngToPKind[inp[pos]] & mask) != 0;
        }

        // Consumes one consonant at iPos. If the character is not flagged CONSON, the pair
        // QU / GU (read in the direction of travel) is accepted and consumed as a unit.
        // Returns true on match (iPos advanced), false otherwise (iPos unchanged).
        static bool FindConsonant(byte[] inp, ref int iPos, int direction)
        {
            if (!InBounds(inp, iPos)) return false;

            if (HasKind(inp, iPos, CONSON))
            {
                iPos += direction;
                return true;
            }

            byte cur = inp[iPos];
            if (direction == -1)
            {
                // Moving left: we are on the U, the G/Q sits before it.
                if (cur == 'U' && iPos > 0 && (inp[iPos - 1] == 'G' || inp[iPos - 1] == 'Q'))
                {
                    iPos -= 2;
                    return true;
                }
            }
            else
            {
                // Moving right: we are on the Q/G, the U follows.
                if ((cur == 'Q' || cur == 'G') && iPos + 1 < inp.Length && inp[iPos + 1] == 'U')
                {
                    iPos += 2;
                    return true;
                }
            }
            return false;
        }

        // Consumes one sibilant at iPos. If the character is not flagged HISCON, the pair
        // CH / SH (read in the direction of travel) is accepted and consumed as a unit.
        // Returns true on match (iPos advanced), false otherwise (iPos unchanged).
        static bool FindSibilant(byte[] inp, ref int iPos, int direction)
        {
            if (!InBounds(inp, iPos)) return false;

            if (HasKind(inp, iPos, HISCON))
            {
                iPos += direction;
                return true;
            }

            byte cur = inp[iPos];
            if (direction == 1)
            {
                // Moving right: we are on the C/S, the H follows.
                if ((cur == 'C' || cur == 'S') && iPos + 1 < inp.Length && inp[iPos + 1] == 'H')
                {
                    iPos += 2;
                    return true;
                }
            }
            else
            {
                // Moving left: we are on the H, the C/S sits before it.
                if (cur == 'H' && iPos > 0 && (inp[iPos - 1] == 'C' || inp[iPos - 1] == 'S'))
                {
                    iPos -= 2;
                    return true;
                }
            }
            return false;
        }

        // Consumes one character flagged VOWEL at iPos.
        // Returns true on match (iPos advanced), false otherwise (iPos unchanged).
        static bool FindVowel(byte[] inp, ref int iPos, int direction)
        {
            if (!HasKind(inp, iPos, VOWEL)) return false;
            iPos += direction;
            return true;
        }

        // Tries each entry of a comma-delimited, NUL-terminated table of byte sequences
        // against the input at iPos, in table order. The bytes of an entry are compared in
        // stored order while the input position moves by `direction`.
        // An entry ends at ',', at the terminating NUL, or at the end of the array, so the
        // final entry matches whether or not it is followed by a comma.
        // Returns true for the first entry that matches completely (iPos moved past the
        // consumed input), false if no entry matches (iPos unchanged).
        static bool SearchSpecial(byte[] inp, ref int iPos, byte[] table, int direction)
        {
            int entryStart = 0;

            while (entryStart < table.Length && table[entryStart] != 0)
            {
                int ti = entryStart;
                int ii = iPos;

                // Advance through the entry for as long as the input agrees with it.
                while (ti < table.Length && table[ti] != ',' && table[ti] != 0
                       && InBounds(inp, ii) && inp[ii] == table[ti])
                {
                    ii += direction;
                    ti++;
                }

                // Stopping on an entry boundary means every byte of the entry matched.
                bool entryMatched = ti >= table.Length || table[ti] == ',' || table[ti] == 0;
                if (entryMatched)
                {
                    iPos = ii;
                    return true;
                }

                // Mismatch: skip the rest of this entry and the comma that follows it.
                while (ti < table.Length && table[ti] != ',' && table[ti] != 0)
                    ti++;
                entryStart = (ti < table.Length && table[ti] == ',') ? ti + 1 : ti;
            }
            return false;
        }
    }
}  // namespace