// GraphemeToPhoneme.cs
#nullable enable
namespace SharpTalk
{
/// <summary>
/// Converts a single English word into phoneme codes by applying the
/// letter-to-sound rules stored in <c>Tables</c>.
/// </summary>
/// <remarks>
/// As read by this class, every rule in <c>Tables.EngToPRules</c> is laid out as:
/// a length byte (offset from this rule to the next one), the letters that must
/// follow the current letter, the left context, the right context and the output
/// phonemes. Each of those four sections is closed by <see cref="Delimiter"/>.
/// <c>Tables.EngToPHash</c> gives, per letter A-Z, the index of the first rule to try.
/// NOTE(review): the rule search has no upper bound; it assumes every letter's
/// rule chain contains a rule that eventually matches - confirm against Tables.
/// </remarks>
public static class EngToP
{
    /// <summary>Byte that closes each section of a rule.</summary>
    private const byte Delimiter = 0xFF;

    // Bit flags / values read from Tables.EngToPKind, named after the tests that use them.
    private const byte Consonant = 0x01;
    private const byte Sibilant = 0x02;
    private const byte VoicedConsonant = 0x08;
    private const byte Vowel = 0x10;
    private const byte FrontVowel = 0x20;

    /// <summary>Kind value marking a rule byte as a pattern operator rather than a literal.</summary>
    private const byte SpecialChar = 0x80;

    /// <summary>Word boundary marker placed around the word in the working buffer.</summary>
    private const byte Space = (byte)' ';

    /// <summary>
    /// Stand-in for characters that do not fit in one byte. '.' is used because the
    /// main loop already skips it, so such characters simply produce no phonemes.
    /// </summary>
    private const byte Unrepresentable = (byte)'.';

    /// <summary>
    /// Translates <paramref name="word"/> into phoneme codes.
    /// </summary>
    /// <param name="word">The word to convert; it is upper-cased (invariant culture) before matching.</param>
    /// <returns>
    /// The phoneme codes emitted by the matching rules, in order. An empty array is
    /// returned for a null or empty word. Characters other than A-Z emit nothing, and
    /// conversion stops at the first space in the word.
    /// </returns>
    public static byte[] Convert(string word)
    {
        if (string.IsNullOrEmpty(word))
        {
            return System.Array.Empty<byte>();
        }

        // Working buffer: ' ' + WORD + ' ' + ' '.
        // The extra trailing space gives the rule matcher safe lookahead past the word end.
        int len = word.Length;
        byte[] inp = new byte[len + 3];
        inp[0] = Space;
        for (int i = 0; i < len; i++)
        {
            char upper = char.ToUpperInvariant(word[i]);

            // A plain (byte) cast would truncate characters above U+00FF and alias them
            // onto unrelated bytes (e.g. U+0141 -> 'A', U+0120 -> ' '), so replace them.
            inp[i + 1] = upper <= 0xFF ? (byte)upper : Unrepresentable;
        }
        inp[len + 1] = Space;
        inp[len + 2] = Space;

        var phons = new System.Collections.Generic.List<byte>(len * 3);
        int iPos = 1; // first character after the leading space
        while (inp[iPos] != Space)
        {
            int letterIdx = inp[iPos] - 'A';
            if (letterIdx < 0 || letterIdx >= 26)
            {
                // Apostrophes, periods and every other non-letter are skipped silently.
                iPos++;
                continue;
            }

            // Walk this letter's rule chain until one rule matches.
            int ruleIdx = Tables.EngToPHash[letterIdx];
            int resumeAt;
            int outputPos;
            while (!TryMatchRule(inp, iPos, ruleIdx, out resumeAt, out outputPos))
            {
                ruleIdx += Tables.EngToPRules[ruleIdx]; // length byte = offset to the next rule
            }

            // Output phonemes are stored with a +1 bias (so that phoneme 0 can be stored).
            for (; Tables.EngToPRules[outputPos] != Delimiter; outputPos++)
            {
                phons.Add((byte)(Tables.EngToPRules[outputPos] - 1));
            }

            iPos = resumeAt;
        }

        return phons.ToArray();
    }

    /// <summary>
    /// Tests the rule starting at <paramref name="ruleIdx"/> against the letter at
    /// <paramref name="iPos"/>: first the letters following it, then the left context
    /// (scanned right-to-left), then the right context.
    /// </summary>
    /// <param name="inp">Working buffer built by <see cref="Convert"/>.</param>
    /// <param name="iPos">Index of the letter being converted.</param>
    /// <param name="ruleIdx">Index of the rule's length byte.</param>
    /// <param name="resumeAt">On success, index of the first input byte not consumed by the rule.</param>
    /// <param name="outputPos">On success, index of the rule's first output phoneme byte.</param>
    /// <returns><c>true</c> when all three parts of the rule match.</returns>
    private static bool TryMatchRule(byte[] inp, int iPos, int ruleIdx, out int resumeAt, out int outputPos)
    {
        resumeAt = iPos;
        outputPos = -1;

        // Middle pattern: literal bytes that must equal inp[iPos + 1], inp[iPos + 2], ...
        int rPos = ruleIdx + 1; // past the length byte
        int sPos = iPos + 1;
        while (Tables.EngToPRules[rPos] != Delimiter)
        {
            if (sPos >= inp.Length || inp[sPos] != Tables.EngToPRules[rPos])
            {
                return false;
            }
            rPos++;
            sPos++;
        }
        rPos++; // past the delimiter that closes the middle pattern

        // Left context starts one byte before the current letter and moves backwards.
        int leftPos = iPos - 1;
        int afterLeft = DoRule(inp, ref leftPos, rPos, -1);
        if (afterLeft < 0)
        {
            return false;
        }

        // Right context starts at the first byte the middle pattern did not consume.
        int rightPos = sPos;
        int afterRight = DoRule(inp, ref rightPos, afterLeft, 1);
        if (afterRight < 0)
        {
            return false;
        }

        resumeAt = sPos;
        outputPos = afterRight;
        return true;
    }

    /// <summary>
    /// Matches one context section of a rule against the input.
    /// </summary>
    /// <param name="inp">Working buffer.</param>
    /// <param name="iPos">Input position to start at; moved by <paramref name="direction"/> as bytes are consumed.</param>
    /// <param name="rPos">Index of the first byte of the context section in <c>Tables.EngToPRules</c>.</param>
    /// <param name="direction">-1 to scan the input leftwards, +1 to scan rightwards.</param>
    /// <returns>The index just past the section's closing delimiter on success, or -1 on failure.</returns>
    private static int DoRule(byte[] inp, ref int iPos, int rPos, int direction)
    {
        // An empty section (delimiter first) falls straight through and matches.
        for (byte rc = Tables.EngToPRules[rPos]; rc != Delimiter; rc = Tables.EngToPRules[++rPos])
        {
            if (Tables.EngToPKind[rc] != SpecialChar)
            {
                // Literal byte: must equal the input byte exactly.
                if (iPos < 0 || iPos >= inp.Length || inp[iPos] != rc)
                {
                    return -1;
                }
                iPos += direction;
                continue;
            }

            int matchedEnd;
            switch ((char)rc)
            {
                case '*': // one or more consonants
                    if (!TryConsumeConsonant(inp, ref iPos, direction))
                    {
                        return -1;
                    }
                    matchedEnd = TryRestWithinRun(inp, ref iPos, rPos + 1, direction, Consonant);
                    if (matchedEnd >= 0)
                    {
                        return matchedEnd;
                    }
                    break;

                case '$': // exactly one vowel
                    if (!TryConsumeKind(inp, ref iPos, direction, Vowel))
                    {
                        return -1;
                    }
                    break;

                case '^': // exactly one consonant
                    if (!TryConsumeConsonant(inp, ref iPos, direction))
                    {
                        return -1;
                    }
                    break;

                case ':': // zero or more consonants
                    TryConsumeConsonant(inp, ref iPos, direction); // optional, result ignored
                    matchedEnd = TryRestWithinRun(inp, ref iPos, rPos + 1, direction, Consonant);
                    if (matchedEnd >= 0)
                    {
                        return matchedEnd;
                    }
                    break;

                case '+': // front vowel
                    if (!TryConsumeKind(inp, ref iPos, direction, FrontVowel))
                    {
                        return -1;
                    }
                    break;

                case 'v': // zero or more vowels
                    TryConsumeKind(inp, ref iPos, direction, Vowel); // optional, result ignored
                    matchedEnd = TryRestWithinRun(inp, ref iPos, rPos + 1, direction, Vowel);
                    if (matchedEnd >= 0)
                    {
                        return matchedEnd;
                    }
                    break;

                case '#': // one or more vowels
                    if (!TryConsumeKind(inp, ref iPos, direction, Vowel))
                    {
                        return -1;
                    }
                    matchedEnd = TryRestWithinRun(inp, ref iPos, rPos + 1, direction, Vowel);
                    if (matchedEnd >= 0)
                    {
                        return matchedEnd;
                    }
                    break;

                case '.': // voiced consonant
                    if (!TryConsumeKind(inp, ref iPos, direction, VoicedConsonant))
                    {
                        return -1;
                    }
                    break;

                case '&': // sibilant consonant
                    if (!TryConsumeSibilant(inp, ref iPos, direction))
                    {
                        return -1;
                    }
                    break;

                // Operators backed by a list of alternatives in Tables.
                case 'l':
                    if (!SearchSpecial(inp, ref iPos, Tables.EngToPL, direction)) { return -1; }
                    break;
                case '-':
                    if (!SearchSpecial(inp, ref iPos, Tables.EngToPDash, direction)) { return -1; }
                    break;
                case '%':
                    if (!SearchSpecial(inp, ref iPos, Tables.EngToPPercent, direction)) { return -1; }
                    break;
                case 'z':
                    if (!SearchSpecial(inp, ref iPos, Tables.EngToPZ, direction)) { return -1; }
                    break;
                case 'b':
                    if (!SearchSpecial(inp, ref iPos, Tables.EngToPB, direction)) { return -1; }
                    break;
                case '@':
                    if (!SearchSpecial(inp, ref iPos, Tables.EngToPAt, direction)) { return -1; }
                    break;
                case 'm':
                    if (!SearchSpecial(inp, ref iPos, Tables.EngToPM, direction)) { return -1; }
                    break;

                // Any other special byte consumes nothing and is treated as matched.
            }
        }

        return rPos + 1; // past the closing delimiter
    }

    /// <summary>
    /// Shared tail of the repeating operators (<c>*</c>, <c>:</c>, <c>v</c>, <c>#</c>).
    /// While the input byte at <paramref name="iPos"/> has the <paramref name="mask"/>
    /// bit set, tries to match the remainder of the rule section from that byte; if
    /// that fails, consumes the byte and tries again.
    /// </summary>
    /// <param name="inp">Working buffer.</param>
    /// <param name="iPos">Current input position; on failure it is left just past the run.</param>
    /// <param name="restPos">Rule index of the byte following the operator.</param>
    /// <param name="direction">Scan direction (-1 or +1).</param>
    /// <param name="mask">Kind bit every byte of the run must carry.</param>
    /// <returns>
    /// The value returned by the successful remainder match (index past the closing
    /// delimiter), or -1 when no attempt inside the run succeeded; in that case the
    /// caller carries on matching after the run.
    /// </returns>
    private static int TryRestWithinRun(byte[] inp, ref int iPos, int restPos, int direction, byte mask)
    {
        while (HasKind(inp, iPos, mask))
        {
            int probe = iPos;
            int end = DoRule(inp, ref probe, restPos, direction);
            if (end >= 0)
            {
                iPos = probe;
                return end;
            }
            iPos += direction;
        }
        return -1;
    }

    /// <summary>True when <paramref name="pos"/> is inside the buffer and its byte carries <paramref name="mask"/>.</summary>
    private static bool HasKind(byte[] inp, int pos, byte mask)
    {
        return pos >= 0 && pos < inp.Length && (Tables.EngToPKind[inp[pos]] & mask) != 0;
    }

    /// <summary>Consumes one input byte if it carries <paramref name="mask"/>; returns whether it did.</summary>
    private static bool TryConsumeKind(byte[] inp, ref int iPos, int direction, byte mask)
    {
        if (!HasKind(inp, iPos, mask))
        {
            return false;
        }
        iPos += direction;
        return true;
    }

    /// <summary>Consumes one consonant, or the two-letter unit QU / GU.</summary>
    private static bool TryConsumeConsonant(byte[] inp, ref int iPos, int direction)
    {
        return TryConsumeKindOrDigraph(inp, ref iPos, direction, Consonant, (byte)'Q', (byte)'G', (byte)'U');
    }

    /// <summary>Consumes one sibilant, or the two-letter unit CH / SH.</summary>
    private static bool TryConsumeSibilant(byte[] inp, ref int iPos, int direction)
    {
        return TryConsumeKindOrDigraph(inp, ref iPos, direction, Sibilant, (byte)'C', (byte)'S', (byte)'H');
    }

    /// <summary>
    /// Consumes either a single byte carrying <paramref name="mask"/> (tested first),
    /// or a two-letter unit whose first letter (in reading order) is
    /// <paramref name="leadA"/> or <paramref name="leadB"/> and whose second letter is
    /// <paramref name="trail"/>. When scanning leftwards the unit is encountered
    /// second-letter-first.
    /// </summary>
    /// <returns><c>true</c> (with <paramref name="iPos"/> advanced by 1 or 2) on a match.</returns>
    private static bool TryConsumeKindOrDigraph(
        byte[] inp, ref int iPos, int direction, byte mask, byte leadA, byte leadB, byte trail)
    {
        if (iPos < 0 || iPos >= inp.Length)
        {
            return false;
        }

        if ((Tables.EngToPKind[inp[iPos]] & mask) != 0)
        {
            iPos += direction;
            return true;
        }

        if (direction == 1)
        {
            bool leadHere = inp[iPos] == leadA || inp[iPos] == leadB;
            if (leadHere && iPos + 1 < inp.Length && inp[iPos + 1] == trail)
            {
                iPos += 2;
                return true;
            }
        }
        else if (inp[iPos] == trail && iPos > 0 && (inp[iPos - 1] == leadA || inp[iPos - 1] == leadB))
        {
            iPos -= 2;
            return true;
        }

        return false;
    }

    /// <summary>
    /// Tries each entry of a comma-separated, NUL-terminated list in turn and consumes
    /// the first one that matches the input at <paramref name="iPos"/>. Entry bytes are
    /// compared in table order while the input is walked in <paramref name="direction"/>.
    /// </summary>
    /// <param name="inp">Working buffer.</param>
    /// <param name="iPos">Input position; advanced past the matched entry on success, untouched otherwise.</param>
    /// <param name="table">List of alternatives. An entry ends at ',', at a 0 byte, or at the end of the array.</param>
    /// <param name="direction">Scan direction (-1 or +1).</param>
    /// <returns><c>true</c> when an entry matched.</returns>
    private static bool SearchSpecial(byte[] inp, ref int iPos, byte[] table, int direction)
    {
        int entryStart = 0;
        while (entryStart < table.Length && table[entryStart] != 0)
        {
            int t = entryStart;
            int p = iPos;
            while (!IsEntryEnd(table, t) && p >= 0 && p < inp.Length && inp[p] == table[t])
            {
                p += direction;
                t++;
            }

            // The whole entry was consumed. Unlike a comma-only test, this also accepts a
            // final entry that is closed by the terminator instead of a trailing comma.
            if (IsEntryEnd(table, t))
            {
                iPos = p;
                return true;
            }

            // Mismatch: skip the rest of this entry.
            while (!IsEntryEnd(table, t))
            {
                t++;
            }
            if (t >= table.Length || table[t] == 0)
            {
                break; // terminator reached, no more entries
            }
            entryStart = t + 1; // step over the comma
        }

        return false;
    }

    /// <summary>True when index <paramref name="t"/> is a comma, the 0 terminator, or past the end of <paramref name="table"/>.</summary>
    private static bool IsEntryEnd(byte[] table, int t)
    {
        return t >= table.Length || table[t] == (byte)',' || table[t] == 0;
    }
}
} // namespace