feat: better handle punctuation

Certain symbols are turned into one space so the model sees multiple
words instead of one. Previously "[RP]Hi" would turn into "RPHi" and
be its own token. Now it turns into "RP" and "Hi", counting as two
tokens. This change increased the model's accuracy.

Also make "18", "http", "https", and LGBT-related words into stop
words (meaning they're ignored). Each of these stop words made the
model more accurate and reduced unwanted bias.

Messages destined for ML are now normalised by the plugin in the same
way the model's input is normalised for training. This should bring the
plugin's results closer to the expected ones.
This commit is contained in:
Anna 2021-02-17 20:00:35 -05:00
parent e8678f3c7d
commit d00b3b0845
10 changed files with 44 additions and 116 deletions

View File

@ -1,9 +1,10 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace NoSoliciting.Trainer {
public static class Util {
namespace NoSoliciting.Interface {
public static class NoSolUtil {
private static readonly Dictionary<char, string> Replacements = new() {
// numerals
['\ue055'] = "1",
@ -46,7 +47,21 @@ namespace NoSoliciting.Trainer {
private const char LowestReplacement = '\ue022';
public static string Normalise(string input) {
// Symbols that Spacify replaces with a single space each, so that text on
// either side of one of them ends up separated by whitespace.
private static readonly char[] SpaceSymbols = {
'/', '|',
'(', ')',
'[', ']',
'<', '>',
'=', '+',
'.', ',',
'~', '-',
};
// Returns a copy of input in which every character listed in SpaceSymbols
// has been replaced by exactly one space; all other characters are kept.
private static string Spacify(string input) {
    // Splitting on the symbols (empty entries are kept) and re-joining the
    // pieces with a space substitutes each symbol one-for-one.
    var pieces = input.Split(SpaceSymbols);
    return string.Join(" ", pieces);
}
public static string Normalise(string input, bool spacify = false) {
if (input == null) {
throw new ArgumentNullException(nameof(input), "input cannot be null");
}
@ -95,7 +110,10 @@ namespace NoSoliciting.Trainer {
input = builder.ToString();
// NFKD unicode normalisation
return input.Normalize(NormalizationForm.FormKD);
var normalised = input.Normalize(NormalizationForm.FormKD);
// replace several symbols with spaces instead
return spacify ? Spacify(normalised) : normalised;
}
}
}

0
NoSoliciting.Interface/NoSoliciting.Interface.csproj Normal file → Executable file
View File

View File

@ -14,6 +14,7 @@
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\NoSoliciting.Interface\NoSoliciting.Interface.csproj" />
<ProjectReference Include="..\NoSoliciting.Internal.Interface\NoSoliciting.Internal.Interface.csproj" />
</ItemGroup>

View File

@ -9,6 +9,7 @@ using CsvHelper.Configuration;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;
using NoSoliciting.Interface;
using NoSoliciting.Internal.Interface;
namespace NoSoliciting.Trainer {
@ -45,7 +46,7 @@ namespace NoSoliciting.Trainer {
foreach (var record in records) {
// normalise the message
record.Message = Util.Normalise(record.Message);
record.Message = NoSolUtil.Normalise(record.Message, true);
// keep track of how many message of each category we have
if (!classes.ContainsKey(record.Category!)) {
@ -79,19 +80,15 @@ namespace NoSoliciting.Trainer {
.Append(ctx.Transforms.CustomMapping(compute.GetMapping(), "Compute"))
.Append(ctx.Transforms.Text.NormalizeText("MsgNormal", nameof(Data.Message), keepPunctuations: false))
.Append(ctx.Transforms.Text.TokenizeIntoWords("MsgTokens", "MsgNormal"))
// .Append(ctx.Transforms.Text.RemoveStopWords("MsgNoStop", "MsgTokens",
// "the",
// "a",
// "of",
// "in",
// "for",
// "from",
// "and",
// "discord"
// ))
.Append(ctx.Transforms.Text.RemoveDefaultStopWords("MsgNoDefStop", "MsgTokens"))
.Append(ctx.Transforms.Text.RemoveStopWords("MsgNoStop", "MsgNoDefStop",
"discord"
"discord",
"lgbt",
"lgbtq",
"lgbtqia",
"http",
"https",
"18"
))
.Append(ctx.Transforms.Conversion.MapValueToKey("MsgKey", "MsgNoStop"))
.Append(ctx.Transforms.Text.ProduceNgrams("MsgNgrams", "MsgKey", weighting: NgramExtractingEstimator.WeightingCriteria.Tf))
@ -167,7 +164,7 @@ namespace NoSoliciting.Trainer {
var input = new Data {
Channel = channel,
// PartyFinder = channel == 0,
Message = parts[1],
Message = NoSolUtil.Normalise(parts[1], true),
};
var pred = predEngine.Predict(input);

View File

@ -128,6 +128,7 @@ FC,11, Hey Cactuar! Has your Free Company already joined the <CA> ? Are you a
FC,13,"<>>Eclipse would like you to join a - Low & High end Casual FC (24/7buffs). Accept, Decline or PM me♡."
FC,13,"Hello!;-) I've just send u an invite to join Artemis Moonlight. Join us if u like! We are a friendly and helpful FC, willing to help all level players and for fun teamplay. Kupo!"
FC,13,"<<>> Eclipsehey what about joining a  fc full of  people? Low & High end Casual FC (24/7buffs). Accept, Decline or PM me♡."
NORMAL,0," Its taking to long to get a party"
NORMAL,0,[7/8 static] CLEAR FOR 1 LET'S GOOOOOOO -- Happy Brambles | T/H N/E DPS S/W | Discord available
NORMAL,0,"50 - 200, Aetherpool 99/99 §Fast Key strat§ ¦Discord will be available at Floor 151 - 200¦ «Be experienced or cleared»"
NORMAL,0,"<tiles> prog | Happy Brambles | TH N/E, DPS S/W | Boss relative, T/H CW | Tiles start cardinals"
@ -235,6 +236,7 @@ NORMAL,0,In the name of the empire come BROTHERS come kill with me!!
NORMAL,0,Joe Biden. Joe Biden. Joe Biden. Midruda - znail - automarkers - 2-2-1-1 - party SE BOOM
NORMAL,0,Join Eorzea Multiverse! An 18+ discord for people looking for friends or relationships! https://discord.gg/d2nQSgk8j4
NORMAL,0,Join Eorzea Multiverse! An 18+ discord for people looking for friends or relationships! https://discord.gg/d2nQSgk8j4
NORMAL,0,Join Eorzea Multiverse! An 18+ discord for people looking for friends or relationships! https://discord.gg/d2nQSgk8j4
NORMAL,0,"Join for an invite to a CWLS for SHB Relics. Clusters+fragments farms for memories, and HW FATE farming for memories. "
NORMAL,0,"Join or /tell to get an ""OOF"""
NORMAL,0,Join us for weekly Rival Wings runs! Next scheduled time is Saturday at 8PM EST! https://discord.gg/revivalwings
@ -274,6 +276,7 @@ NORMAL,0,Looking to sell medium odder otter walls(2mil) join or tell.
NORMAL,0,Mermaid Themed Character Art! Come join at the chocokeep in Gridania on Siren: https://www.twitch.tv/kikilove135
NORMAL,0,mount farming party !!!! come join! down to help others!
NORMAL,0,need of new friends c: discord: nine#0069
NORMAL,0,Need one DPS for e9s reclear and e10s prog! Discord required. No salt and plenty of chaos!
NORMAL,0,Needing whm glam Come chill with me
NORMAL,0,"New House! Please visit and leave feedback. ^-^ (*AFK*) Gilgamesh, Shiro, Ward 23, plot 28"
NORMAL,0,New learning any help much apprectiated
@ -2059,8 +2062,7 @@ RP,0,RP))Koko Tumen Taashaal is open tomorrow night at 9pm ! Come join us for a
RP,0,"Salt and Sprite Bar & Lounge is open for business! Come on over for drinks and relaxation! Cactuar, Shirogane, W15, P37, R18"
RP,0,Santa's Workshop is open for viewing...come enjoy photo shoots with your fc on Gilgamesh LB Ward 7 plot 6
RP,0,Selling Femroe thigh crushes and other services <3 join party or send tell for more info ;)
RP,0,"SELLING LEMONADE , Ward 16 Plot 35 Goblet Adamantoise
WE HAVE A POOL PARTY GOING ON TOO AND CAT GIRLS!"
RP,0,"SELLING LEMONADE , Ward 16 Plot 35 Goblet Adamantoise WE HAVE A POOL PARTY GOING ON TOO AND CAT GIRLS!"
RP,0,"Seven Circles is open tonight! Enjoy Live Music Songbirds!, Drinks and Devilish Company! 9pm-1am Sarg Mist 9-35!"
RP,0,single Hyur Summoner looking for her daddy ! pm or join party if interested XD
RP,0,SPAGHETTI WESTERN NIGHT AT SPAGET 2112! Free cowboy hats! Whiskey provided by the Whiskey Tears! Gilga Mist W21 P12

1 Category Channel Message
128 FC 13 <>>Eclipse would like you to join a - Low & High end Casual FC (24/7buffs). Accept, Decline or PM me♡.
129 FC 13 Hello!;-) I've just send u an invite to join Artemis Moonlight. Join us if u like! We are a friendly and helpful FC, willing to help all level players and for fun teamplay. Kupo!
130 FC 13 <<>> Eclipsehey what about joining a  fc full of  people? Low & High end Casual FC (24/7buffs). Accept, Decline or PM me♡.
131 NORMAL 0 Its taking to long to get a party
132 NORMAL 0 [7/8 static] CLEAR FOR 1 LET'S GOOOOOOO -- Happy Brambles | T/H N/E DPS S/W | Discord available
133 NORMAL 0 ‹50 - 200, Aetherpool 99/99› §Fast Key strat§ ¦Discord will be available at Floor 151 - 200¦ «Be experienced or cleared»
134 NORMAL 0 <tiles> prog | Happy Brambles | TH N/E, DPS S/W | Boss relative, T/H CW | Tiles start cardinals
236 NORMAL 0 Joe Biden. Joe Biden. Joe Biden. Midruda - znail - automarkers - 2-2-1-1 - party SE BOOM
237 NORMAL 0 Join Eorzea Multiverse! An 18+ discord for people looking for friends or relationships! https://discord.gg/d2nQSgk8j4
238 NORMAL 0 Join Eorzea Multiverse! An 18+ discord for people looking for friends or relationships! https://discord.gg/d2nQSgk8j4
239 NORMAL 0 Join Eorzea Multiverse! An 18+ discord for people looking for friends or relationships! https://discord.gg/d2nQSgk8j4
240 NORMAL 0 Join for an invite to a CWLS for SHB Relics. Clusters+fragments farms for memories, and HW FATE farming for memories.
241 NORMAL 0 Join or /tell to get an "OOF"
242 NORMAL 0 Join us for weekly Rival Wings runs! Next scheduled time is Saturday at 8PM EST! https://discord.gg/revivalwings
276 NORMAL 0 Mermaid Themed Character Art! Come join at the chocokeep in Gridania on Siren: https://www.twitch.tv/kikilove135
277 NORMAL 0 mount farming party !!!! come join! down to help others!
278 NORMAL 0 need of new friends c: discord: nine#0069
279 NORMAL 0 Need one DPS for e9s reclear and e10s prog! Discord required. No salt and plenty of chaos!
280 NORMAL 0 Needing whm glam Come chill with me
281 NORMAL 0 New House! Please visit and leave feedback. ^-^ (*AFK*) Gilgamesh, Shiro, Ward 23, plot 28
282 NORMAL 0 New learning any help much apprectiated
2062 RP 0 Salt and Sprite Bar & Lounge is open for business! Come on over for drinks and relaxation! Cactuar, Shirogane, W15, P37, R18
2063 RP 0 Santa's Workshop is open for viewing...come enjoy photo shoots with your fc on Gilgamesh LB Ward 7 plot 6
2064 RP 0 Selling Femroe thigh crushes and other services <3 join party or send tell for more info ;)
2065 RP 0 SELLING LEMONADE , Ward 16 Plot 35 Goblet Adamantoise WE HAVE A POOL PARTY GOING ON TOO AND CAT GIRLS! SELLING LEMONADE , Ward 16 Plot 35 Goblet Adamantoise WE HAVE A POOL PARTY GOING ON TOO AND CAT GIRLS!
RP 0 Seven Circles is open tonight! Enjoy Live Music Songbirds!, Drinks and Devilish Company! 9pm-1am Sarg Mist 9-35!
2066 RP 0 single Hyur Summoner looking for her daddy ! pm or join party if interested XD Seven Circles is open tonight! Enjoy Live Music Songbirds!, Drinks and Devilish Company! 9pm-1am Sarg Mist 9-35!
2067 RP 0 SPAGHETTI WESTERN NIGHT AT SPAGET 2112! Free cowboy hats! Whiskey provided by the Whiskey Tears! Gilga Mist W21 P12 single Hyur Summoner looking for her daddy ! pm or join party if interested XD
2068 RP 0 Sugar Sugar Hime Lounge is hosting a winter raffle! Drinks are on the house <3 Faerie - Goblet - W14 P35 discord.gg/HRpZ54m SPAGHETTI WESTERN NIGHT AT SPAGET 2112! Free cowboy hats! Whiskey provided by the Whiskey Tears! Gilga Mist W21 P12

View File

@ -8,6 +8,7 @@ using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Dalamud.Game.Chat;
using Dalamud.Plugin;
using NoSoliciting.Interface;
using NoSoliciting.Properties;
using YamlDotNet.Core;
using YamlDotNet.Core.Events;
@ -195,7 +196,7 @@ namespace NoSoliciting {
}
if (this.Normalise) {
text = FilterUtil.Normalise(text);
text = NoSolUtil.Normalise(text);
}
if (this.IgnoreCase) {

View File

@ -1,5 +1,6 @@
using System;
using System.Linq;
using NoSoliciting.Interface;
namespace NoSoliciting {
public partial class Filter {
@ -13,7 +14,7 @@ namespace NoSoliciting {
return false;
}
msg = FilterUtil.Normalise(msg);
msg = NoSolUtil.Normalise(msg);
return config.ChatSubstrings.Any(needle => msg.ContainsIgnoreCase(needle))
|| config.CompiledChatRegexes.Any(needle => needle.IsMatch(msg));

View File

@ -1,5 +1,6 @@
using System;
using System.Linq;
using NoSoliciting.Interface;
namespace NoSoliciting {
public partial class Filter {
@ -13,7 +14,7 @@ namespace NoSoliciting {
return false;
}
msg = FilterUtil.Normalise(msg);
msg = NoSolUtil.Normalise(msg);
return config.PFSubstrings.Any(needle => msg.ContainsIgnoreCase(needle))
|| config.CompiledPFRegexes.Any(needle => needle.IsMatch(msg));

View File

@ -8,100 +8,6 @@ using System.Text;
namespace NoSoliciting {
public static class FilterUtil {
// Lookup table used by Normalise: maps a single special character to the
// plain-text string that should be emitted in its place. Characters that
// fall inside one of the contiguous ranges handled arithmetically in
// Normalise are not listed here.
private static readonly Dictionary<char, string> Replacements = new() {
// numerals
['\ue055'] = "1",
['\ue056'] = "2",
['\ue057'] = "3",
['\ue058'] = "4",
['\ue059'] = "5",
['\ue099'] = "10",
['\ue09a'] = "11",
['\ue09b'] = "12",
['\ue09c'] = "13",
['\ue09d'] = "14",
['\ue09e'] = "15",
['\ue09f'] = "16",
['\ue0a0'] = "17",
['\ue0a1'] = "18",
['\ue0a2'] = "19",
['\ue0a3'] = "20",
['\ue0a4'] = "21",
['\ue0a5'] = "22",
['\ue0a6'] = "23",
['\ue0a7'] = "24",
['\ue0a8'] = "25",
['\ue0a9'] = "26",
['\ue0aa'] = "27",
['\ue0ab'] = "28",
['\ue0ac'] = "29",
['\ue0ad'] = "30",
['\ue0ae'] = "31",
// symbols
['\ue0af'] = "+",
['\ue070'] = "?",
// letters in other sets
['\ue022'] = "A",
['\ue024'] = "_A",
['\ue0b0'] = "E",
};
// Smallest character that Normalise will consider replacing; anything
// below it is copied through unchanged without further checks.
private const char LowestReplacement = '\ue022';
/// <summary>
/// Rewrites FFXIV private-use characters in <paramref name="input"/> as
/// plain text and then applies NFKD unicode normalisation.
/// </summary>
/// <param name="input">The text to normalise.</param>
/// <returns>The normalised text.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="input"/> is null.
/// </exception>
public static string Normalise(string input) {
    if (input == null) {
        throw new ArgumentNullException(nameof(input), "input cannot be null");
    }

    var output = new StringBuilder(input.Length);

    foreach (var ch in input) {
        if (ch < LowestReplacement) {
            // below every character we know how to replace: keep as-is
            output.Append(ch);
        } else if (ch >= 0xe071 && ch <= 0xe08a) {
            // alphabet: shifts onto 'A'..'Z'
            output.Append((char) (ch - 0xe030));
        } else if (ch >= 0xe060 && ch <= 0xe069) {
            // digits: shifts onto '0'..'9'
            output.Append((char) (ch - 0xe030));
        } else if (ch >= 0xe0b1 && ch <= 0xe0b9) {
            // digits: shifts onto '1'..'9'
            output.Append((char) (ch - 0xe080));
        } else if (ch >= 0xe090 && ch <= 0xe098) {
            // a second run of digits that shifts onto '1'..'9'
            output.Append((char) (ch - 0xe05f));
        } else if (Replacements.TryGetValue(ch, out var replacement)) {
            // one-off replacements listed in the map
            output.Append(replacement);
        } else {
            // nothing matched: keep the character unchanged
            output.Append(ch);
        }
    }

    // NFKD unicode normalisation
    var replaced = output.ToString();
    return replaced.Normalize(NormalizationForm.FormKD);
}
private static int MaxItemLevel { get; set; }
private enum Slot {

View File

@ -37,6 +37,7 @@ namespace NoSoliciting.Ml {
}
public MessageCategory ClassifyMessage(ushort channel, string message) {
message = NoSolUtil.Normalise(message, true);
var prediction = this.Classifier.InvokeAsync(classifier => classifier.Classify(channel, message)).Result;
var category = MessageCategoryExt.FromString(prediction);