// Code/LexiconReader.cs
#nullable enable
using System;
using System.Buffers.Binary;

namespace SharpTalk
{

    /// <summary>
    /// Read-only accessor for an in-memory binary dictionary image that maps
    /// words to phoneme byte strings.
    /// </summary>
    /// <remarks>
    /// The image starts with a big-endian header (see the constructor), followed
    /// somewhere by an index table of <c>wordCount</c> big-endian 32-bit offsets.
    /// Each offset points at an entry laid out as: one length byte, that many
    /// word bytes, then phoneme bytes terminated by a byte with the 0x80 bit set.
    /// Entries are expected to be sorted by word so they can be binary-searched.
    /// The byte array passed to the constructor is kept by reference, not copied.
    /// </remarks>
    public class DictReader
    {
        readonly byte[] _data;
        readonly int _wordsOff;  // header words_off; stored but not consulted by this class
        readonly int[] _index;   // per-entry byte offsets into _data (range-checked in the constructor)
        readonly uint[] _hash;   // 27 entries: hash['A'-'A'] .. hash['Z'-'A'+1]
        readonly int _wordCount;

        // HASH_ENTRIES = 'Z'-'A'+2 = 27
        const int HASH_ENTRIES = 27;
        const byte kEndFlag = 0x80;
        const byte kAltFlag = 0xFF;  // not referenced anywhere in this class

        // Byte positions of the header fields this class reads.
        const int kWordCountPos = 12;
        const int kHashPos = 16;
        const int kWordsOffPos = 1148;
        const int kIndexOffPos = 1152;
        const int kMinHeaderSize = kIndexOffPos + 4;

        /// <summary>
        /// Parses the header and index table of a dictionary image.
        /// </summary>
        /// <param name="data">The complete dictionary image. It is retained, not copied.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The image is too short for the header, the index table does not fit
        /// inside it, or an index entry points outside it.
        /// </exception>
        public DictReader(byte[] data)
        {
            if (data is null) throw new ArgumentNullException(nameof(data));
            if (data.Length < kMinHeaderSize)
            {
                throw new ArgumentOutOfRangeException(
                    nameof(data),
                    $"Dictionary image is {data.Length} bytes; at least {kMinHeaderSize} are required for the header.");
            }

            _data = data;
            ReadOnlySpan<byte> bytes = data;

            // Header layout (all big-endian):
            //    0  uint32 nextDict_off
            //    4  uint32 version
            //    8  uint32 type
            //   12  uint32 wordCount
            //   16  uint32 hash[27]         (108 bytes)
            //  124  short  POScodes[128][4] (1024 bytes, ignored for now)
            // 1148  uint32 words_off
            // 1152  uint32 index_off
            // 1156  uint32 flags

            uint wordCount = BinaryPrimitives.ReadUInt32BigEndian(bytes.Slice(kWordCountPos));

            _hash = new uint[HASH_ENTRIES];
            for (int i = 0; i < HASH_ENTRIES; i++)
                _hash[i] = BinaryPrimitives.ReadUInt32BigEndian(bytes.Slice(kHashPos + i * 4));

            _wordsOff = (int)BinaryPrimitives.ReadUInt32BigEndian(bytes.Slice(kWordsOffPos));
            uint indexOff = BinaryPrimitives.ReadUInt32BigEndian(bytes.Slice(kIndexOffPos));

            // Checked in 64-bit arithmetic so a hostile wordCount/index_off cannot wrap.
            // Passing this check also guarantees wordCount fits in an int.
            if (wordCount > 0 && indexOff + 4L * wordCount > data.Length)
            {
                throw new ArgumentOutOfRangeException(
                    nameof(data),
                    $"Index table ({wordCount} entries at offset {indexOff}) does not fit in the {data.Length}-byte image.");
            }

            _wordCount = (int)wordCount;

            // Expand 32-bit big-endian index entries to int offsets, rejecting any
            // entry whose length byte or word bytes would lie outside the image.
            _index = new int[_wordCount];
            for (int i = 0; i < _wordCount; i++)
            {
                uint entryOff = BinaryPrimitives.ReadUInt32BigEndian(bytes.Slice((int)indexOff + i * 4));
                if (entryOff >= (uint)data.Length || entryOff + 1L + data[entryOff] > data.Length)
                {
                    throw new ArgumentOutOfRangeException(
                        nameof(data),
                        $"Index entry {i} (offset {entryOff}) points outside the {data.Length}-byte image.");
                }
                _index[i] = (int)entryOff;
            }
        }

        /// <summary>
        /// Looks up a word by binary search within the bucket selected by its first character.
        /// </summary>
        /// <param name="word">
        /// The word to find. It is compared character-by-character against the stored
        /// bytes with no case folding.
        /// </param>
        /// <returns>
        /// A fresh copy of the entry's phoneme bytes (terminator excluded), or
        /// <c>null</c> when the word is null, empty, or not present.
        /// </returns>
        public byte[]? Search(string word)
        {
            if (_wordCount == 0 || string.IsNullOrEmpty(word)) return null;

            char first = word[0];

            int lo, hi;
            if (first >= 'A' && first <= 'Z')
            {
                int letterIdx = first - 'A';
                lo = (int)_hash[letterIdx];
                hi = (int)_hash[letterIdx + 1] - 1;
            }
            else if (first < 'A')
            {
                lo = 0;
                hi = (int)_hash[0] - 1;
            }
            else // > 'Z'
            {
                // Starts at the 'Z' bucket; a superset of what is strictly needed,
                // which is harmless for a binary search over sorted entries.
                lo = (int)_hash['Z' - 'A'];
                hi = _wordCount - 1;
            }

            // The hash table comes from the file: never let it steer us outside _index.
            lo = Math.Max(lo, 0);
            hi = Math.Min(hi, _wordCount - 1);

            while (lo <= hi)
            {
                int mid = lo + ((hi - lo) >> 1);
                int entryOff = _index[mid];
                int diff = CompareToEntry(word, entryOff);

                if (diff > 0) lo = mid + 1;
                else if (diff < 0) hi = mid - 1;
                else return ReadPhonemes(entryOff);
            }
            return null;
        }

        /// <summary>
        /// Lazily yields every entry in index order.
        /// </summary>
        /// <returns>
        /// Pairs of the word (its stored bytes decoded as ASCII) and a fresh copy
        /// of its phoneme bytes (terminator excluded).
        /// </returns>
        public System.Collections.Generic.IEnumerable<(string word, byte[] phons)> EnumerateAll()
        {
            for (int i = 0; i < _wordCount; i++)
            {
                int entryOff = _index[i];
                int wordLen = _data[entryOff];
                string word = System.Text.Encoding.ASCII.GetString(_data, entryOff + 1, wordLen);
                yield return (word, ReadPhonemes(entryOff));
            }
        }

        /// <summary>Number of entries declared by the dictionary header.</summary>
        public int WordCount => _wordCount;

        // Orders `word` against the word stored at entryOff: char value minus byte
        // value at the first mismatch, otherwise the difference in lengths.
        int CompareToEntry(string word, int entryOff)
        {
            int storedLen = _data[entryOff];
            ReadOnlySpan<byte> stored = _data.AsSpan(entryOff + 1, storedLen);

            int common = Math.Min(word.Length, storedLen);
            for (int i = 0; i < common; i++)
            {
                int diff = word[i] - stored[i];
                if (diff != 0) return diff;
            }
            return word.Length - storedLen;
        }

        // Copies the bytes that follow the entry's word, stopping before the first
        // byte with kEndFlag set (or at the end of the image if none is found).
        byte[] ReadPhonemes(int entryOff)
        {
            int start = entryOff + 1 + _data[entryOff];
            int end = start;
            while (end < _data.Length && (_data[end] & kEndFlag) == 0)
                end++;
            return _data.AsSpan(start, end - start).ToArray();
        }
    }
}  // namespace