diff --git a/NoSoliciting.Trainer/NoSoliciting.Trainer.csproj b/NoSoliciting.Trainer/NoSoliciting.Trainer.csproj index dfd6bfa..69b6e19 100755 --- a/NoSoliciting.Trainer/NoSoliciting.Trainer.csproj +++ b/NoSoliciting.Trainer/NoSoliciting.Trainer.csproj @@ -11,6 +11,7 @@ + diff --git a/NoSoliciting.Trainer/Program.cs b/NoSoliciting.Trainer/Program.cs index 3e68536..f9a81ce 100644 --- a/NoSoliciting.Trainer/Program.cs +++ b/NoSoliciting.Trainer/Program.cs @@ -12,6 +12,7 @@ using CsvHelper.Configuration; using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Transforms.Text; +using MimeKit; using Newtonsoft.Json; using Newtonsoft.Json.Serialization; using NoSoliciting.Interface; @@ -60,9 +61,12 @@ namespace NoSoliciting.Trainer { private static void Import(string path) { var allData = new List(); + var opts = new ParserOptions { + CharsetEncoding = Encoding.UTF8, + }; foreach (var emlPath in Directory.GetFiles(path, "*.eml")) { - var lines = File.ReadAllLines(emlPath); - var json = lines.FirstOrDefault(line => line.StartsWith("JSON: ")); + var message = MimeMessage.Load(opts, new FileStream(emlPath, FileMode.Open)); + var json = message.TextBody.Split('\r', '\n').FirstOrDefault(line => line.StartsWith("JSON: ")); if (json == null) { continue; } @@ -73,16 +77,22 @@ namespace NoSoliciting.Trainer { var data = new Data(report.Type, content) { Category = report.SuggestedClassification, }; + data.Message = data.Message + .Replace("\r\n", " ") + .Replace('\r', ' ') + .Replace('\n', ' '); allData.Add(data); } var writer = new StringWriter(); using var csv = new CsvWriter(writer, new CsvConfiguration(CultureInfo.InvariantCulture) { HeaderValidated = null, + Encoding = Encoding.UTF8, }); csv.WriteRecords(allData .OrderBy(data => data.Channel) .ThenBy(data => data.Message)); + Console.OutputEncoding = Encoding.UTF8; Console.WriteLine(writer.ToString()); }