Editor/Mcp/Docs/FuzzyIndex.cs
using Sandbox;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace SboxMcp.Mcp.Docs;
public sealed class IndexedField
{
public string Name { get; }
public double Boost { get; }
public IndexedField( string name, double boost ) { Name = name; Boost = boost; }
}
public sealed class FuzzyIndexConfig
{
public IReadOnlyList<IndexedField> Fields { get; init; } = Array.Empty<IndexedField>();
public bool Prefix { get; init; } = true;
public double Fuzzy { get; init; } = 0.2;
}
public sealed class FuzzyHit<T>
{
public T Item { get; }
public double Score { get; }
public FuzzyHit( T item, double score ) { Item = item; Score = score; }
}
/// <summary>
/// Per-field inverted token map with TF + IDF + field boosts, plus optional
/// prefix matching and bounded edit-distance fuzzy fallback. Pragmatic
/// re-implementation of the slice of MiniSearch we need; indexes a few thousand
/// small documents with millisecond query latency.
/// </summary>
public sealed class FuzzyIndex<T>
{
private static readonly Regex Tokenizer = new( @"[\p{L}\p{N}_]+", RegexOptions.Compiled );
private readonly FuzzyIndexConfig _config;
private readonly Func<T, IReadOnlyDictionary<string, string>> _extract;
private readonly List<T> _items = new();
private readonly Dictionary<string, Dictionary<string, List<(int DocId, int Count)>>> _index = new();
private readonly Dictionary<string, List<int>> _fieldLengths = new();
public FuzzyIndex( FuzzyIndexConfig config, Func<T, IReadOnlyDictionary<string, string>> extract )
{
_config = config;
_extract = extract;
foreach ( var f in _config.Fields )
{
_index[f.Name] = new Dictionary<string, List<(int, int)>>( StringComparer.Ordinal );
_fieldLengths[f.Name] = new List<int>();
}
}
public int Count => _items.Count;
public T this[int idx] => _items[idx];
public void Clear()
{
_items.Clear();
foreach ( var f in _config.Fields )
{
_index[f.Name].Clear();
_fieldLengths[f.Name].Clear();
}
}
public void AddRange( IEnumerable<T> items )
{
foreach ( var item in items ) Add( item );
}
public void Add( T item )
{
var docId = _items.Count;
_items.Add( item );
var fieldValues = _extract( item );
foreach ( var f in _config.Fields )
{
fieldValues.TryGetValue( f.Name, out var text );
var tokens = Tokenize( text ?? "" );
_fieldLengths[f.Name].Add( tokens.Count );
var counts = new Dictionary<string, int>( StringComparer.Ordinal );
foreach ( var t in tokens )
counts[t] = counts.TryGetValue( t, out var c ) ? c + 1 : 1;
var idx = _index[f.Name];
foreach ( var (token, count) in counts )
{
if ( !idx.TryGetValue( token, out var list ) )
{
list = new List<(int, int)>();
idx[token] = list;
}
list.Add( (docId, count) );
}
}
}
public List<FuzzyHit<T>> Search( string query, int limit )
{
if ( string.IsNullOrWhiteSpace( query ) || _items.Count == 0 )
return new List<FuzzyHit<T>>();
var queryTokens = Tokenize( query );
if ( queryTokens.Count == 0 ) return new List<FuzzyHit<T>>();
var scores = new Dictionary<int, double>();
foreach ( var qToken in queryTokens )
{
foreach ( var f in _config.Fields )
{
var fieldIdx = _index[f.Name];
var lengths = _fieldLengths[f.Name];
if ( fieldIdx.TryGetValue( qToken, out var exactList ) )
Accumulate( scores, exactList, _items.Count, f.Boost, 1.0, lengths );
if ( _config.Prefix && qToken.Length >= 2 )
{
foreach ( var (token, list) in fieldIdx )
{
if ( token.Length > qToken.Length && token.StartsWith( qToken, StringComparison.Ordinal ) )
Accumulate( scores, list, _items.Count, f.Boost, 0.6, lengths );
}
}
if ( _config.Fuzzy > 0 && qToken.Length >= 4 && !fieldIdx.ContainsKey( qToken ) )
{
var maxDist = (int)Math.Floor( qToken.Length * _config.Fuzzy );
if ( maxDist >= 1 )
{
foreach ( var (token, list) in fieldIdx )
{
if ( Math.Abs( token.Length - qToken.Length ) > maxDist ) continue;
var d = LevenshteinBounded( qToken, token, maxDist );
if ( d >= 0 && d <= maxDist )
{
var w = 1.0 - (double)d / (qToken.Length + 1);
Accumulate( scores, list, _items.Count, f.Boost, w * 0.5, lengths );
}
}
}
}
}
}
return scores
.OrderByDescending( kv => kv.Value )
.Take( limit )
.Select( kv => new FuzzyHit<T>( _items[kv.Key], kv.Value ) )
.ToList();
}
private static void Accumulate(
Dictionary<int, double> scores,
List<(int DocId, int Count)> postings,
int totalDocs,
double fieldBoost,
double weight,
List<int> fieldLengths )
{
if ( postings.Count == 0 ) return;
var idf = Math.Log( (totalDocs - postings.Count + 0.5) / (postings.Count + 0.5) + 1.0 );
if ( idf <= 0 ) idf = 0.01;
foreach ( var (docId, count) in postings )
{
var len = docId < fieldLengths.Count ? fieldLengths[docId] : 0;
var tf = count / (count + 1.0 + 0.001 * len);
var contribution = idf * tf * fieldBoost * weight;
scores[docId] = scores.TryGetValue( docId, out var existing ) ? existing + contribution : contribution;
}
}
internal static List<string> Tokenize( string text )
{
if ( string.IsNullOrEmpty( text ) ) return new List<string>();
var matches = Tokenizer.Matches( text );
var result = new List<string>( matches.Count );
foreach ( Match m in matches )
{
if ( m.Length < 2 ) continue;
result.Add( m.Value.ToLowerInvariant() );
}
return result;
}
internal static int LevenshteinBounded( string a, string b, int maxDist )
{
if ( a == b ) return 0;
var la = a.Length;
var lb = b.Length;
if ( Math.Abs( la - lb ) > maxDist ) return -1;
var prev = new int[lb + 1];
var curr = new int[lb + 1];
for ( var j = 0; j <= lb; j++ ) prev[j] = j;
for ( var i = 1; i <= la; i++ )
{
curr[0] = i;
var rowMin = curr[0];
for ( var j = 1; j <= lb; j++ )
{
var cost = a[i - 1] == b[j - 1] ? 0 : 1;
curr[j] = Math.Min(
Math.Min( curr[j - 1] + 1, prev[j] + 1 ),
prev[j - 1] + cost );
if ( curr[j] < rowMin ) rowMin = curr[j];
}
if ( rowMin > maxDist ) return -1;
(prev, curr) = (curr, prev);
}
return prev[lb];
}
}