// Code/TextCommands.cs
#nullable enable
using System;
using System.Collections.Generic;
using static SharpTalk.AudioProcessor;
namespace SharpTalk
{
public static class EmbeddedCmd
{
// Converts an ASCII note name (C5, A#4, Bb3) to a frequency in Hz using
// equal temperament (A4 = 440 Hz), rounded to the nearest integer.
//
// Format: one letter A-G (either case), an optional '#' (sharp) or lowercase
// 'b' (flat), then an integer octave. MIDI numbering is used, so C4 = 60.
//
// Returns 0 when the name cannot be parsed or when the frequency does not fit
// in a 16-bit signed value. The caller narrows the result to short, so larger
// values would otherwise wrap around to a negative pitch.
static int NoteNameToHz(string name)
{
    if (name.Length < 2) return 0;

    int semitone = char.ToUpperInvariant(name[0]) switch
    {
        'C' => 0, 'D' => 2, 'E' => 4, 'F' => 5,
        'G' => 7, 'A' => 9, 'B' => 11, _ => -1,
    };
    if (semitone < 0) return 0;

    // Length >= 2 is guaranteed above, so index 1 is always readable.
    int pos = 1;
    if (name[pos] == '#') { semitone++; pos++; }
    else if (name[pos] == 'b') { semitone--; pos++; }
    if (pos >= name.Length) return 0;

    // Invariant culture: the octave is machine-readable markup, not UI text.
    if (!int.TryParse(name[pos..],
                      System.Globalization.NumberStyles.Integer,
                      System.Globalization.CultureInfo.InvariantCulture,
                      out int octave))
    {
        return 0;
    }

    // Work in double so an absurd octave cannot overflow the int arithmetic.
    double midi = 12.0 * (octave + 1.0) + semitone;
    double hz = 440.0 * Math.Pow(2.0, (midi - 69.0) / 12.0);

    // Also rejects +Infinity produced by very large octaves.
    if (!(hz <= short.MaxValue)) return 0;

    return (int)Math.Round(hz);
}
// Looks up the phoneme code for a lowercase phoneme mnemonic.
// The match is case-sensitive, so callers are expected to lowercase first.
// Returns -1 when the mnemonic is not recognised.
static short MapPhoneme(string p)
{
    switch (p)
    {
        // Vowels
        case "iy": return _IY_; case "ih": return _IH_; case "eh": return _EH_; case "ae": return _AE_;
        case "aa": return _AA_; case "ah": return _AH_; case "ao": return _AO_; case "uh": return _UH_;
        case "ax": return _AX_; case "er": return _ER_; case "ey": return _EY_; case "ay": return _AY_;
        case "oy": return _OY_; case "aw": return _AW_; case "ow": return _OW_; case "uw": return _UW_;
        case "ix": return _IX_;

        // Single-letter vowel shortcuts: i->IH, e->EH, a->AE, o->AO, u->UW.
        // These allow compact notation such as "KIT" instead of "KIHT".
        case "i": return _IH_; case "e": return _EH_; case "a": return _AE_;
        case "o": return _AO_; case "u": return _UW_;

        // Sonorants
        case "w": return _W_; case "y": return _Y_; case "r": return _R_; case "l": return _L_;

        // Nasals
        case "m": return _M_; case "n": return _N_; case "ng": return _NG_;

        // Fricatives
        case "hh": return _HH_; case "f": return _F_; case "v": return _V_;
        case "th": return _TH_; case "dh": return _DH_;
        case "s": return _S_; case "z": return _Z_;
        case "sh": return _SH_; case "zh": return _ZH_;

        // Stops
        case "p": return _P_; case "b": return _B_;
        case "t": return _T_; case "d": return _D_; case "dx": return _DX_;
        case "k": return _K_; case "g": return _G_;

        // Affricates
        case "ch": return _CH_; case "jh": return _JH_;

        // Japanese vowels
        case "jp_iy": return _JPI_; case "jp_eh": return _JPE_; case "jp_aa": return _JPA_;
        case "jp_ow": return _JPO_; case "jp_uw": return _JPU_;

        // Silence / rest
        case "_": return _SIL_;

        default: return -1;
    }
}
// An inline voice-setting change: which setting (Kind) and its integer value.
public readonly struct VoiceCommand
{
    // The voice setting a command targets.
    public enum Kind { Rate, Pitch, Volume }

    public readonly Kind Type;
    public readonly int Value;

    public VoiceCommand(Kind type, int value) => (Type, Value) = (type, value);
}
// Shared parser state: while true, ParseSegments treats incoming text as
// Klattsch text. ParseSegments sets it on "[:klattsch on]" and clears it on
// "[:klattsch off]"; being static, the value carries over between calls.
public static bool KlattschMode = false;
// One parsed unit of input. Each constructor/factory fills in exactly one of
// the payload fields below and leaves the others null.
public readonly struct Segment
{
    public readonly string? PlainText;            // ordinary text
    public readonly List<PhonemeToken>? Singing;  // tokens from a [phoneme<...>] block
    public readonly VoiceCommand? Cmd;            // [:rate N] / [:pitch N] / [:volume N]
    public readonly string? KlattschText;         // text captured while Klattsch mode is on

    public bool IsSinging => Singing is { };
    public bool IsCommand => Cmd.HasValue;
    public bool IsKlattsch => KlattschText is { };

    public Segment(string text) : this(text, null, null, null) { }

    public Segment(List<PhonemeToken> s) : this(null, s, null, null) { }

    public Segment(VoiceCommand cmd) : this(null, null, cmd, null) { }

    // Factory rather than a constructor: a (string) constructor already exists
    // for plain text, so Klattsch text needs a distinct entry point.
    public static Segment Klattsch(string text) => new Segment(null, null, null, text);

    private Segment(string? p, List<PhonemeToken>? s, VoiceCommand? c, string? k)
    {
        PlainText = p;
        Singing = s;
        Cmd = c;
        KlattschText = k;
    }
}
// Splits text into an ordered list of segments:
//   - plain text runs,
//   - "[:rate N]", "[:pitch N]", "[:volume N]" voice commands,
//   - "[phoneme<dur,note> ...]" singing blocks,
//   - Klattsch text (everything between "[:klattsch on]" and "[:klattsch off]").
//
// "[:sing]" and "[:talk]" / "[:stop]" toggle whether phoneme groups without a
// <dur,note> suffix are kept. Other "[:...]" commands, and rate/pitch/volume
// commands whose argument is not an integer, are consumed and dropped.
//
// Side effects: reads and updates the static KlattschMode flag and calls
// KlattschParser.Reset() on "[:klattsch on]", so the mode persists into
// subsequent calls.
public static List<Segment> ParseSegments(string text)
{
    // Saturation limit for numbers typed inside <dur,note>. It is far above
    // anything that fits in the 16-bit token fields, and low enough that
    // "value * 10 + digit" can never overflow an int.
    const int NumberCap = 1_000_000;

    var segments = new List<Segment>();
    if (KlattschMode)
    {
        // In Klattsch mode, we still look for [:klattsch off]
        int offIdx = text.IndexOf("[:klattsch off]", StringComparison.OrdinalIgnoreCase);
        if (offIdx >= 0)
        {
            string before = text[..offIdx];
            if (before.Length > 0) segments.Add(Segment.Klattsch(before));
            KlattschMode = false;
            string after = text[(offIdx + "[:klattsch off]".Length)..];
            if (after.Length > 0) segments.AddRange(ParseSegments(after));
            return segments;
        }
        segments.Add(Segment.Klattsch(text));
        return segments;
    }
    // Fast path: without '[' there can be no commands or phoneme blocks.
    if (!text.Contains('['))
    {
        if (text.Length > 0) segments.Add(new Segment(text));
        return segments;
    }
    var plain = new System.Text.StringBuilder();
    bool inSingMode = false;
    int i = 0;
    // Emits the accumulated plain text (if any) as its own segment.
    void FlushPlain()
    {
        if (plain.Length > 0) { segments.Add(new Segment(plain.ToString())); plain.Clear(); }
    }
    // Reads a run of ASCII digits at s[idx], advancing idx past them.
    // Only '0'-'9' are accepted: char.IsDigit is also true for other Unicode
    // decimal digits, for which "c - '0'" is not the digit's value.
    // The result saturates at NumberCap instead of overflowing.
    int ReadAsciiNumber(string s, ref int idx)
    {
        int value = 0;
        while (idx < s.Length && s[idx] >= '0' && s[idx] <= '9')
            value = Math.Min(NumberCap, value * 10 + (s[idx++] - '0'));
        return value;
    }
    while (i < text.Length)
    {
        if (text[i] != '[') { plain.Append(text[i++]); continue; }
        i++; // consume '['
        if (i >= text.Length) break;
        if (text[i] == ':')
        {
            i++; // Skip ':'
            int cmdStart = i;
            while (i < text.Length && !char.IsWhiteSpace(text[i]) && text[i] != ']') i++;
            string cmd = text[cmdStart..i].ToLowerInvariant();
            while (i < text.Length && char.IsWhiteSpace(text[i])) i++;
            int argStart = i;
            while (i < text.Length && text[i] != ']') i++;
            string argStr = text[argStart..i].Trim().ToLowerInvariant();
            if (i < text.Length) i++; // consume ']'
            if (cmd == "klattsch")
            {
                if (argStr == "on")
                {
                    // Everything after this tag is handled by the Klattsch
                    // branch at the top of the method via the recursive call.
                    FlushPlain();
                    KlattschMode = true;
                    KlattschParser.Reset();
                    string rest = text[i..];
                    segments.AddRange(ParseSegments(rest));
                    return segments;
                }
                else if (argStr == "off")
                {
                    KlattschMode = false;
                }
            }
            else if (cmd == "sing") inSingMode = true;
            else if (cmd == "talk" || cmd == "stop") inSingMode = false;
            else if (int.TryParse(argStr, out int argVal))
            {
                VoiceCommand.Kind? kind = cmd switch { "rate" => VoiceCommand.Kind.Rate, "pitch" => VoiceCommand.Kind.Pitch, "volume" => VoiceCommand.Kind.Volume, _ => null };
                if (kind is { } k) { FlushPlain(); segments.Add(new Segment(new VoiceCommand(k, argVal))); }
            }
            continue;
        }
        // Phoneme block [phoneme<dur,note> ...]
        var blockSing = new List<PhonemeToken>();
        bool firstPhon = true;
        bool firstInBlock = true; // Track first note in the [...] block
        short lastPitch = 0; // inherited by trailing consonants with no <note>
        while (i < text.Length && text[i] != ']')
        {
            while (i < text.Length && text[i] == ' ') i++;
            if (i >= text.Length || text[i] == ']') break;
            if (text[i] == '_' || char.IsLetter(text[i]))
            {
                // Collect all phonemes up to '<', ']', or ' '
                // "dey<600,24>" -> [d, ey] with dur=600 note=24
                var group = new List<short>();
                while (i < text.Length && text[i] != '<' && text[i] != ']' && text[i] != ' ')
                {
                    // "jp_xx" / "jp_x" Japanese phonemes: try the longer code first.
                    if ((text[i] == 'J' || text[i] == 'j') && i + 3 < text.Length
                        && (text[i+1] == 'P' || text[i+1] == 'p') && text[i+2] == '_')
                    {
                        bool matchedJp = false;
                        if (i + 4 < text.Length && char.IsLetter(text[i+3]) && char.IsLetter(text[i+4]))
                        {
                            string code5 = ("jp_" + text[i+3] + text[i+4]).ToLowerInvariant();
                            short p5 = MapPhoneme(code5);
                            if (p5 >= 0) { group.Add(p5); i += 5; matchedJp = true; }
                        }
                        if (!matchedJp && char.IsLetter(text[i+3]))
                        {
                            string code4 = ("jp_" + text[i+3]).ToLowerInvariant();
                            short p4 = MapPhoneme(code4);
                            if (p4 >= 0) { group.Add(p4); i += 4; matchedJp = true; }
                        }
                        if (matchedJp) continue;
                    }
                    if (text[i] == '_') { group.Add(_SIL_); i++; continue; }
                    // Greedy match: prefer a two-letter phoneme over a one-letter one.
                    bool matched2 = false;
                    if (i + 1 < text.Length && char.IsLetter(text[i + 1]))
                    {
                        string two = string.Concat(text[i], text[i + 1]).ToLowerInvariant();
                        short op2 = MapPhoneme(two);
                        if (op2 >= 0) { group.Add(op2); i += 2; matched2 = true; }
                    }
                    if (!matched2)
                    {
                        // Unknown single characters fall back to silence.
                        string one = text[i].ToString().ToLowerInvariant();
                        short op1 = MapPhoneme(one);
                        group.Add(op1 >= 0 ? op1 : _SIL_);
                        i++;
                    }
                }
                int dur = 0, note = 0;
                bool hasNote = false, noteIsNamed = false;
                if (i < text.Length && text[i] == '<')
                {
                    hasNote = true;
                    i++;
                    dur = ReadAsciiNumber(text, ref i);
                    if (i < text.Length && text[i] == ',')
                    {
                        i++;
                        while (i < text.Length && text[i] == ' ') i++;
                        if (i < text.Length && char.IsLetter(text[i]))
                        {
                            int nameStart = i;
                            while (i < text.Length && text[i] != '>' && text[i] != ']') i++;
                            note = NoteNameToHz(text[nameStart..i].Trim());
                            noteIsNamed = true;
                        }
                        else
                        {
                            note = ReadAsciiNumber(text, ref i);
                        }
                    }
                    // Skip anything unparsed up to the closing '>' (or the block end).
                    while (i < text.Length && text[i] != '>' && text[i] != ']') i++;
                    if (i < text.Length && text[i] == '>') i++;
                }
                if (!hasNote && !inSingMode && blockSing.Count == 0) continue;
                // Named notes are stored as positive Hz and numeric notes as the
                // negated number (presumably so downstream code can tell the two
                // apart - confirm against the consumer of UserNote). Range-check
                // before narrowing so an oversized value cannot wrap around and
                // flip that sign.
                short pitch;
                if (!hasNote)
                    pitch = lastPitch;
                else if (noteIsNamed)
                    pitch = (note >= 0 && note <= short.MaxValue) ? (short)note : (short)0;
                else
                    pitch = (short)(-Math.Min(note, short.MaxValue));
                if (hasNote) lastPitch = pitch;
                int durIdx = group.Count - 1;
                // Subtract every other phoneme's minimum duration from the
                // user-specified duration so the whole cluster fits the beat.
                // We account for the 5ms initial silence and backend frame rounding.
                int overhead = firstInBlock ? 5 : 0;
                firstInBlock = false;
                for (int gi2 = 0; gi2 < group.Count; gi2++)
                {
                    if (gi2 == durIdx) continue;
                    short p = group[gi2];
                    int m = (p == _SIL_) ? 5 : Tables.MinDurTbl[p];
                    overhead += (m / 5) * 5;
                }
                // Floor of 5 as before; the ceiling keeps the (short) cast below
                // from turning a very long duration into a negative one.
                int adjustedDur = Math.Min(short.MaxValue, Math.Max(5, dur - overhead));
                for (int gi = 0; gi < group.Count; gi++)
                {
                    long ctrl = kWord_Start | kContent_Word;
                    if (pitch != 0) ctrl |= kSingingPhon;
                    if (hasNote && gi == durIdx) ctrl |= kSingingDuration;
                    // Only the very first phoneme of the block keeps the word-start flags.
                    if (firstPhon) { firstPhon = false; }
                    else ctrl &= ~(kWord_Start | kContent_Word);
                    blockSing.Add(new PhonemeToken
                    {
                        Phon = group[gi],
                        Ctrl = ctrl,
                        UserDur = hasNote && gi == durIdx ? (short)adjustedDur : (short)0,
                        UserNote = (hasNote && gi == durIdx) ? pitch : (short)0,
                    });
                }
            }
            else { i++; }
        }
        if (i < text.Length && text[i] == ']') i++;
        if (blockSing.Count > 0)
        {
            FlushPlain();
            segments.Add(new Segment(blockSing));
        }
    }
    FlushPlain();
    return segments;
}
// Flattens the result of ParseSegments(text) into a plain-text string plus
// one combined list of singing tokens.
//   returns       - the PlainText of every non-singing, non-command segment,
//                   concatenated in order (Klattsch segments add nothing,
//                   because their PlainText is null)
//   singingTokens - all singing tokens in order, or null when there are none
public static string Parse(string text, out List<PhonemeToken>? singingTokens)
{
    singingTokens = null;

    List<PhonemeToken>? collected = null;
    var spoken = new System.Text.StringBuilder();

    foreach (Segment segment in ParseSegments(text))
    {
        if (segment.Singing is { } tokens)
        {
            collected ??= new List<PhonemeToken>();
            collected.AddRange(tokens);
            continue;
        }

        if (segment.IsCommand)
        {
            continue;
        }

        spoken.Append(segment.PlainText);
    }

    singingTokens = collected;
    return spoken.ToString();
}
// Returns only the plain-text portion of the input, discarding the singing
// tokens that Parse also produces. Because it goes through Parse and hence
// ParseSegments, it can change the static KlattschMode flag.
public static string StripCommands(string text) => Parse(text, out _);
}
} // namespace