From d00b3b08458b162aa9e6b6bb4826ec5eaeb13701 Mon Sep 17 00:00:00 2001 From: Anna Date: Wed, 17 Feb 2021 20:00:35 -0500 Subject: [PATCH] feat: better handle punctuation Certain symbols are turned into one space so the model sees multiple words instead of one. Previously "[RP]Hi" would turn into "RPHi" and be its own token. Now it turns into "RP" and "Hi", counting as two tokens. This change increased the model's accuracy. Also make "18", "http", "https", and LGBT-related words into stop words (meaning they're ignored). Each of these stop words made the model more accurate and reduced unwanted bias. Messages destined for ML are now normalised by the plugin in the same way the model's input is for training. This should make the results come closer to expected. --- .../NoSolUtil.cs | 26 ++++- .../NoSoliciting.Interface.csproj | 0 .../NoSoliciting.Trainer.csproj | 1 + NoSoliciting.Trainer/Program.cs | 23 ++--- NoSoliciting.Trainer/data.csv | 6 +- NoSoliciting/Definitions.cs | 3 +- NoSoliciting/Filter.Chat.cs | 3 +- NoSoliciting/Filter.PartyFinder.cs | 3 +- NoSoliciting/FilterUtil.cs | 94 ------------------- NoSoliciting/Ml/MlFilter.cs | 1 + 10 files changed, 44 insertions(+), 116 deletions(-) rename NoSoliciting.Trainer/Util.cs => NoSoliciting.Interface/NoSolUtil.cs (78%) mode change 100644 => 100755 mode change 100644 => 100755 NoSoliciting.Interface/NoSoliciting.Interface.csproj diff --git a/NoSoliciting.Trainer/Util.cs b/NoSoliciting.Interface/NoSolUtil.cs old mode 100644 new mode 100755 similarity index 78% rename from NoSoliciting.Trainer/Util.cs rename to NoSoliciting.Interface/NoSolUtil.cs index 4ca2a55..57fea76 --- a/NoSoliciting.Trainer/Util.cs +++ b/NoSoliciting.Interface/NoSolUtil.cs @@ -1,9 +1,10 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Text; -namespace NoSoliciting.Trainer { - public static class Util { +namespace NoSoliciting.Interface { + public static class NoSolUtil { private static readonly Dictionary 
Replacements = new() { // numerals ['\ue055'] = "1", @@ -46,7 +47,21 @@ namespace NoSoliciting.Trainer { private const char LowestReplacement = '\ue022'; - public static string Normalise(string input) { + private static readonly char[] SpaceSymbols = { + '/', '|', + '(', ')', + '[', ']', + '<', '>', + '=', '+', + '.', ',', + '~', '-', + }; + + private static string Spacify(string input) { + return SpaceSymbols.Aggregate(input, (current, sym) => current.Replace(sym, ' ')); + } + + public static string Normalise(string input, bool spacify = false) { if (input == null) { throw new ArgumentNullException(nameof(input), "input cannot be null"); } @@ -95,7 +110,10 @@ namespace NoSoliciting.Trainer { input = builder.ToString(); // NFKD unicode normalisation - return input.Normalize(NormalizationForm.FormKD); + var normalised = input.Normalize(NormalizationForm.FormKD); + + // replace several symbols with spaces instead + return spacify ? Spacify(normalised) : normalised; } } } diff --git a/NoSoliciting.Interface/NoSoliciting.Interface.csproj b/NoSoliciting.Interface/NoSoliciting.Interface.csproj old mode 100644 new mode 100755 diff --git a/NoSoliciting.Trainer/NoSoliciting.Trainer.csproj b/NoSoliciting.Trainer/NoSoliciting.Trainer.csproj index 7373dbd..9336d48 100755 --- a/NoSoliciting.Trainer/NoSoliciting.Trainer.csproj +++ b/NoSoliciting.Trainer/NoSoliciting.Trainer.csproj @@ -14,6 +14,7 @@ + diff --git a/NoSoliciting.Trainer/Program.cs b/NoSoliciting.Trainer/Program.cs index 8f98d29..a03e383 100644 --- a/NoSoliciting.Trainer/Program.cs +++ b/NoSoliciting.Trainer/Program.cs @@ -9,6 +9,7 @@ using CsvHelper.Configuration; using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Transforms.Text; +using NoSoliciting.Interface; using NoSoliciting.Internal.Interface; namespace NoSoliciting.Trainer { @@ -45,7 +46,7 @@ namespace NoSoliciting.Trainer { foreach (var record in records) { // normalise the message - record.Message = Util.Normalise(record.Message); + 
record.Message = NoSolUtil.Normalise(record.Message, true); // keep track of how many message of each category we have if (!classes.ContainsKey(record.Category!)) { @@ -79,19 +80,15 @@ namespace NoSoliciting.Trainer { .Append(ctx.Transforms.CustomMapping(compute.GetMapping(), "Compute")) .Append(ctx.Transforms.Text.NormalizeText("MsgNormal", nameof(Data.Message), keepPunctuations: false)) .Append(ctx.Transforms.Text.TokenizeIntoWords("MsgTokens", "MsgNormal")) - // .Append(ctx.Transforms.Text.RemoveStopWords("MsgNoStop", "MsgTokens", - // "the", - // "a", - // "of", - // "in", - // "for", - // "from", - // "and", - // "discord" - // )) .Append(ctx.Transforms.Text.RemoveDefaultStopWords("MsgNoDefStop", "MsgTokens")) .Append(ctx.Transforms.Text.RemoveStopWords("MsgNoStop", "MsgNoDefStop", - "discord" + "discord", + "lgbt", + "lgbtq", + "lgbtqia", + "http", + "https", + "18" )) .Append(ctx.Transforms.Conversion.MapValueToKey("MsgKey", "MsgNoStop")) .Append(ctx.Transforms.Text.ProduceNgrams("MsgNgrams", "MsgKey", weighting: NgramExtractingEstimator.WeightingCriteria.Tf)) @@ -167,7 +164,7 @@ namespace NoSoliciting.Trainer { var input = new Data { Channel = channel, // PartyFinder = channel == 0, - Message = parts[1], + Message = NoSolUtil.Normalise(parts[1], true), }; var pred = predEngine.Predict(input); diff --git a/NoSoliciting.Trainer/data.csv b/NoSoliciting.Trainer/data.csv index 8a5e14c..4025677 100755 --- a/NoSoliciting.Trainer/data.csv +++ b/NoSoliciting.Trainer/data.csv @@ -128,6 +128,7 @@ FC,11, Hey Cactuar! Has your Free Company already joined the ? Are you a FC,13,"<>>Eclipse would like you to join a - Low & High end Casual FC (24/7buffs). Accept, Decline or PM me♡." FC,13,"Hello!;-) I've just send u an invite to join Artemis Moonlight. Join us if u like! We are a friendly and helpful FC, willing to help all level players and for fun teamplay. Kupo!" 
FC,13,"<<>> Eclipsehey what about joining a  fc full of  people? Low & High end Casual FC (24/7buffs). Accept, Decline or PM me♡." +NORMAL,0," Its taking to long to get a party" NORMAL,0,[7/8 static] CLEAR FOR 1 LET'S GOOOOOOO -- Happy Brambles | T/H N/E DPS S/W | Discord available NORMAL,0,"‹50 - 200, Aetherpool 99/99› §Fast Key strat§ ¦Discord will be available at Floor 151 - 200¦ «Be experienced or cleared»" NORMAL,0," prog | Happy Brambles | TH N/E, DPS S/W | Boss relative, T/H CW | Tiles start cardinals" @@ -235,6 +236,7 @@ NORMAL,0,In the name of the empire come BROTHERS come kill with me!! NORMAL,0,Joe Biden. Joe Biden. Joe Biden. Midruda - znail - automarkers - 2-2-1-1 - party SE BOOM NORMAL,0,Join Eorzea Multiverse! An 18+ discord for people looking for friends or relationships! https://discord.gg/d2nQSgk8j4 NORMAL,0,Join Eorzea Multiverse! An 18+ discord for people looking for friends or relationships! https://discord.gg/d2nQSgk8j4 +NORMAL,0,Join Eorzea Multiverse! An 18+ discord for people looking for friends or relationships! https://discord.gg/d2nQSgk8j4 NORMAL,0,"Join for an invite to a CWLS for SHB Relics. Clusters+fragments farms for memories, and HW FATE farming for memories. " NORMAL,0,"Join or /tell to get an ""OOF""" NORMAL,0,Join us for weekly Rival Wings runs! Next scheduled time is Saturday at 8PM EST! https://discord.gg/revivalwings @@ -274,6 +276,7 @@ NORMAL,0,Looking to sell medium odder otter walls(2mil) join or tell. NORMAL,0,Mermaid Themed Character Art! Come join at the chocokeep in Gridania on Siren: https://www.twitch.tv/kikilove135 NORMAL,0,mount farming party !!!! come join! down to help others! NORMAL,0,need of new friends c: discord: nine#0069 +NORMAL,0,Need one DPS for e9s reclear and e10s prog! Discord required. No salt and plenty of chaos! NORMAL,0,Needing whm glam Come chill with me NORMAL,0,"New House! Please visit and leave feedback. 
^-^ (*AFK*) Gilgamesh, Shiro, Ward 23, plot 28" NORMAL,0,New learning any help much apprectiated @@ -2059,8 +2062,7 @@ RP,0,RP))Koko Tumen Taashaal is open tomorrow night at 9pm ! Come join us for a RP,0,"Salt and Sprite Bar & Lounge is open for business! Come on over for drinks and relaxation! Cactuar, Shirogane, W15, P37, R18" RP,0,Santa's Workshop is open for viewing...come enjoy photo shoots with your fc on Gilgamesh LB Ward 7 plot 6 RP,0,Selling Femroe thigh crushes and other services <3 join party or send tell for more info ;) -RP,0,"SELLING LEMONADE , Ward 16 Plot 35 Goblet Adamantoise -WE HAVE A POOL PARTY GOING ON TOO AND CAT GIRLS!" +RP,0,"SELLING LEMONADE , Ward 16 Plot 35 Goblet Adamantoise WE HAVE A POOL PARTY GOING ON TOO AND CAT GIRLS!" RP,0,"Seven Circles is open tonight! Enjoy Live Music Songbirds!, Drinks and Devilish Company! 9pm-1am Sarg Mist 9-35!" RP,0,single Hyur Summoner looking for her daddy ! pm or join party if interested XD RP,0,SPAGHETTI WESTERN NIGHT AT SPAGET 2112! Free cowboy hats! Whiskey provided by the Whiskey Tears! 
Gilga Mist W21 P12 diff --git a/NoSoliciting/Definitions.cs b/NoSoliciting/Definitions.cs index dde5624..e556a96 100644 --- a/NoSoliciting/Definitions.cs +++ b/NoSoliciting/Definitions.cs @@ -8,6 +8,7 @@ using System.Text.RegularExpressions; using System.Threading.Tasks; using Dalamud.Game.Chat; using Dalamud.Plugin; +using NoSoliciting.Interface; using NoSoliciting.Properties; using YamlDotNet.Core; using YamlDotNet.Core.Events; @@ -195,7 +196,7 @@ namespace NoSoliciting { } if (this.Normalise) { - text = FilterUtil.Normalise(text); + text = NoSolUtil.Normalise(text); } if (this.IgnoreCase) { diff --git a/NoSoliciting/Filter.Chat.cs b/NoSoliciting/Filter.Chat.cs index 469e3df..c3863ae 100644 --- a/NoSoliciting/Filter.Chat.cs +++ b/NoSoliciting/Filter.Chat.cs @@ -1,5 +1,6 @@ using System; using System.Linq; +using NoSoliciting.Interface; namespace NoSoliciting { public partial class Filter { @@ -13,7 +14,7 @@ namespace NoSoliciting { return false; } - msg = FilterUtil.Normalise(msg); + msg = NoSolUtil.Normalise(msg); return config.ChatSubstrings.Any(needle => msg.ContainsIgnoreCase(needle)) || config.CompiledChatRegexes.Any(needle => needle.IsMatch(msg)); diff --git a/NoSoliciting/Filter.PartyFinder.cs b/NoSoliciting/Filter.PartyFinder.cs index d343ff9..f9ecb35 100644 --- a/NoSoliciting/Filter.PartyFinder.cs +++ b/NoSoliciting/Filter.PartyFinder.cs @@ -1,5 +1,6 @@ using System; using System.Linq; +using NoSoliciting.Interface; namespace NoSoliciting { public partial class Filter { @@ -13,7 +14,7 @@ namespace NoSoliciting { return false; } - msg = FilterUtil.Normalise(msg); + msg = NoSolUtil.Normalise(msg); return config.PFSubstrings.Any(needle => msg.ContainsIgnoreCase(needle)) || config.CompiledPFRegexes.Any(needle => needle.IsMatch(msg)); diff --git a/NoSoliciting/FilterUtil.cs b/NoSoliciting/FilterUtil.cs index 0c8bbdc..652e847 100644 --- a/NoSoliciting/FilterUtil.cs +++ b/NoSoliciting/FilterUtil.cs @@ -8,100 +8,6 @@ using System.Text; namespace NoSoliciting { 
public static class FilterUtil { - private static readonly Dictionary Replacements = new() { - // numerals - ['\ue055'] = "1", - ['\ue056'] = "2", - ['\ue057'] = "3", - ['\ue058'] = "4", - ['\ue059'] = "5", - ['\ue099'] = "10", - ['\ue09a'] = "11", - ['\ue09b'] = "12", - ['\ue09c'] = "13", - ['\ue09d'] = "14", - ['\ue09e'] = "15", - ['\ue09f'] = "16", - ['\ue0a0'] = "17", - ['\ue0a1'] = "18", - ['\ue0a2'] = "19", - ['\ue0a3'] = "20", - ['\ue0a4'] = "21", - ['\ue0a5'] = "22", - ['\ue0a6'] = "23", - ['\ue0a7'] = "24", - ['\ue0a8'] = "25", - ['\ue0a9'] = "26", - ['\ue0aa'] = "27", - ['\ue0ab'] = "28", - ['\ue0ac'] = "29", - ['\ue0ad'] = "30", - ['\ue0ae'] = "31", - - // symbols - ['\ue0af'] = "+", - ['\ue070'] = "?", - - // letters in other sets - ['\ue022'] = "A", - ['\ue024'] = "_A", - ['\ue0b0'] = "E", - }; - - private const char LowestReplacement = '\ue022'; - - public static string Normalise(string input) { - if (input == null) { - throw new ArgumentNullException(nameof(input), "input cannot be null"); - } - - // replace ffxiv private use chars - var builder = new StringBuilder(input.Length); - foreach (var c in input) { - if (c < LowestReplacement) { - goto AppendNormal; - } - - // alphabet - if (c >= 0xe071 && c <= 0xe08a) { - builder.Append((char) (c - 0xe030)); - continue; - } - - // 0 to 9 - if (c >= 0xe060 && c <= 0xe069) { - builder.Append((char) (c - 0xe030)); - continue; - } - - // 1 to 9 - if (c >= 0xe0b1 && c <= 0xe0b9) { - builder.Append((char) (c - 0xe080)); - continue; - } - - // 1 to 9 again - if (c >= 0xe090 && c <= 0xe098) { - builder.Append((char) (c - 0xe05f)); - continue; - } - - // replacements in map - if (Replacements.TryGetValue(c, out var rep)) { - builder.Append(rep); - continue; - } - - AppendNormal: - builder.Append(c); - } - - input = builder.ToString(); - - // NFKD unicode normalisation - return input.Normalize(NormalizationForm.FormKD); - } - private static int MaxItemLevel { get; set; } private enum Slot { diff --git 
a/NoSoliciting/Ml/MlFilter.cs b/NoSoliciting/Ml/MlFilter.cs index 9bb56aa..6ab0247 100644 --- a/NoSoliciting/Ml/MlFilter.cs +++ b/NoSoliciting/Ml/MlFilter.cs @@ -37,6 +37,7 @@ namespace NoSoliciting.Ml { } public MessageCategory ClassifyMessage(ushort channel, string message) { + message = NoSolUtil.Normalise(message, true); var prediction = this.Classifier.InvokeAsync(classifier => classifier.Classify(channel, message)).Result; var category = MessageCategoryExt.FromString(prediction);