Anna
d00b3b0845
Certain symbols are now turned into a single space so the model sees multiple words instead of one. Previously "[RP]Hi" would turn into "RPHi" and be its own token. Now it turns into "RP" and "Hi", counting as two tokens. This change increased the model's accuracy. Also, "18", "http", "https", and LGBT-related words are now stop words (meaning they're ignored). Each of these stop words made the model more accurate and reduced unwanted bias. Messages destined for ML are now normalised by the plugin in the same way the model's input is normalised for training. This should bring the results closer to what is expected.
25 lines
780 B
C#
25 lines
780 B
C#
using System;
|
|
using System.Linq;
|
|
using NoSoliciting.Interface;
|
|
|
|
namespace NoSoliciting {
    public partial class Filter {
        /// <summary>
        /// Matching logic for the user-defined PF filters held in <see cref="PluginConfiguration"/>.
        /// </summary>
        private static class PartyFinder {
            /// <summary>
            /// Tests a message against the custom PF substrings and compiled regexes in the configuration.
            /// </summary>
            /// <param name="msg">The message text; it is passed through <c>NoSolUtil.Normalise</c> before matching.</param>
            /// <param name="config">The plugin configuration supplying the filter toggle, substrings and regexes.</param>
            /// <returns>
            /// <c>false</c> when <c>config.CustomPFFilter</c> is off; otherwise <c>true</c> if the normalised
            /// message contains any entry of <c>config.PFSubstrings</c> (case-insensitively, via
            /// <c>ContainsIgnoreCase</c>) or matches any entry of <c>config.CompiledPFRegexes</c>.
            /// </returns>
            /// <exception cref="ArgumentNullException">Thrown when <paramref name="config"/> is null.</exception>
            public static bool MatchesCustomFilters(string msg, PluginConfiguration config) {
                if (config == null) {
                    throw new ArgumentNullException(nameof(config), "PluginConfiguration cannot be null");
                }

                // Nothing to match when the custom filter feature is switched off.
                if (!config.CustomPFFilter) {
                    return false;
                }

                // Match against the normalised text only, never the raw message.
                var normalised = NoSolUtil.Normalise(msg);

                // Substrings are checked first; regexes are only consulted if no substring hits.
                var substringHit = config.PFSubstrings.Any(substring => normalised.ContainsIgnoreCase(substring));
                if (substringHit) {
                    return true;
                }

                return config.CompiledPFRegexes.Any(regex => regex.IsMatch(normalised));
            }
        }
    }
}
|