feat(trainer): add import feature

This commit is contained in:
Anna 2021-07-19 16:15:31 -04:00
parent aac3b42b47
commit 21c7e01097
2 changed files with 13 additions and 2 deletions

View File

@ -11,6 +11,7 @@
<PackageReference Include="ConsoleTables" Version="2.4.2"/> <PackageReference Include="ConsoleTables" Version="2.4.2"/>
<PackageReference Include="CsvHelper" Version="27.1.1"/> <PackageReference Include="CsvHelper" Version="27.1.1"/>
<PackageReference Include="Microsoft.ML" Version="1.6.0"/> <PackageReference Include="Microsoft.ML" Version="1.6.0"/>
<PackageReference Include="MimeKitLite" Version="2.13.0"/>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>

View File

@ -12,6 +12,7 @@ using CsvHelper.Configuration;
using Microsoft.ML; using Microsoft.ML;
using Microsoft.ML.Data; using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text; using Microsoft.ML.Transforms.Text;
using MimeKit;
using Newtonsoft.Json; using Newtonsoft.Json;
using Newtonsoft.Json.Serialization; using Newtonsoft.Json.Serialization;
using NoSoliciting.Interface; using NoSoliciting.Interface;
@ -60,9 +61,12 @@ namespace NoSoliciting.Trainer {
private static void Import(string path) { private static void Import(string path) {
var allData = new List<Data>(); var allData = new List<Data>();
var opts = new ParserOptions {
CharsetEncoding = Encoding.UTF8,
};
foreach (var emlPath in Directory.GetFiles(path, "*.eml")) { foreach (var emlPath in Directory.GetFiles(path, "*.eml")) {
var lines = File.ReadAllLines(emlPath); var message = MimeMessage.Load(opts, new FileStream(emlPath, FileMode.Open));
var json = lines.FirstOrDefault(line => line.StartsWith("JSON: ")); var json = message.TextBody.Split('\r', '\n').FirstOrDefault(line => line.StartsWith("JSON: "));
if (json == null) { if (json == null) {
continue; continue;
} }
@ -73,16 +77,22 @@ namespace NoSoliciting.Trainer {
var data = new Data(report.Type, content) { var data = new Data(report.Type, content) {
Category = report.SuggestedClassification, Category = report.SuggestedClassification,
}; };
data.Message = data.Message
.Replace("\r\n", " ")
.Replace('\r', ' ')
.Replace('\n', ' ');
allData.Add(data); allData.Add(data);
} }
var writer = new StringWriter(); var writer = new StringWriter();
using var csv = new CsvWriter(writer, new CsvConfiguration(CultureInfo.InvariantCulture) { using var csv = new CsvWriter(writer, new CsvConfiguration(CultureInfo.InvariantCulture) {
HeaderValidated = null, HeaderValidated = null,
Encoding = Encoding.UTF8,
}); });
csv.WriteRecords(allData csv.WriteRecords(allData
.OrderBy(data => data.Channel) .OrderBy(data => data.Channel)
.ThenBy(data => data.Message)); .ThenBy(data => data.Message));
Console.OutputEncoding = Encoding.UTF8;
Console.WriteLine(writer.ToString()); Console.WriteLine(writer.ToString());
} }