From cd042e53681176a604640d9069b169f1f95914ed Mon Sep 17 00:00:00 2001 From: Alexey Golub Date: Sun, 15 Sep 2019 21:24:07 +0300 Subject: [PATCH] Rework markdown parser and improve its performance for non-HTML formats --- .../Internal/AggregateMatcher.cs | 8 +- .../Internal/Extensions.cs | 40 +++++---- .../Internal/IMatcher.cs | 2 +- .../Internal/ParsedMatch.cs | 9 +- .../Internal/RegexMatcher.cs | 31 +++++-- .../Internal/StringMatcher.cs | 19 ++-- .../Internal/StringPart.cs | 28 ++++++ .../MarkdownParser.cs | 86 +++++++++++-------- .../Nodes/EmojiNode.cs | 7 +- .../Nodes/FormattedNode.cs | 6 +- .../Nodes/InlineCodeBlockNode.cs | 3 +- .../Nodes/LinkNode.cs | 6 +- .../Nodes/MentionNode.cs | 3 +- ...BlockNode.cs => MultiLineCodeBlockNode.cs} | 5 +- .../Nodes/Node.cs | 6 -- .../Nodes/TextNode.cs | 7 +- .../CsvChatLogRenderer.cs | 31 ++++--- .../HtmlChatLogRenderer.cs | 10 +-- .../PlainTextChatLogRenderer.cs | 31 ++++--- .../Resources/HtmlShared.css | 2 +- 20 files changed, 201 insertions(+), 139 deletions(-) create mode 100644 DiscordChatExporter.Core.Markdown/Internal/StringPart.cs rename DiscordChatExporter.Core.Markdown/Nodes/{MultilineCodeBlockNode.cs => MultiLineCodeBlockNode.cs} (65%) diff --git a/DiscordChatExporter.Core.Markdown/Internal/AggregateMatcher.cs b/DiscordChatExporter.Core.Markdown/Internal/AggregateMatcher.cs index 449bcb55..ee540965 100644 --- a/DiscordChatExporter.Core.Markdown/Internal/AggregateMatcher.cs +++ b/DiscordChatExporter.Core.Markdown/Internal/AggregateMatcher.cs @@ -16,7 +16,7 @@ namespace DiscordChatExporter.Core.Markdown.Internal { } - public ParsedMatch Match(string input, int startIndex, int length) + public ParsedMatch Match(StringPart stringPart) { ParsedMatch earliestMatch = null; @@ -24,19 +24,19 @@ namespace DiscordChatExporter.Core.Markdown.Internal foreach (var matcher in _matchers) { // Try to match - var match = matcher.Match(input, startIndex, length); + var match = matcher.Match(stringPart); // If there's no match - continue if (match == null) continue; // If this match is earlier than previous earliest - replace - if (earliestMatch == null || match.StartIndex < earliestMatch.StartIndex) + if (earliestMatch == null || match.StringPart.StartIndex < earliestMatch.StringPart.StartIndex) earliestMatch = match; // If the earliest match starts at the very beginning - break, // because it's impossible to find a match earlier than that - if (earliestMatch.StartIndex == startIndex) + if (earliestMatch.StringPart.StartIndex == stringPart.StartIndex) break; } diff --git a/DiscordChatExporter.Core.Markdown/Internal/Extensions.cs b/DiscordChatExporter.Core.Markdown/Internal/Extensions.cs index dfff64c9..db8c845a 100644 --- a/DiscordChatExporter.Core.Markdown/Internal/Extensions.cs +++ b/DiscordChatExporter.Core.Markdown/Internal/Extensions.cs @@ -1,50 +1,54 @@ using System; using System.Collections.Generic; +using System.Text.RegularExpressions; namespace DiscordChatExporter.Core.Markdown.Internal { internal static class Extensions { - public static IEnumerable> MatchAll(this IMatcher matcher, string input, - int startIndex, int length, Func fallbackTransform) - { - // Get end index for simplicity - var endIndex = startIndex + length; + public static StringPart Shrink(this StringPart stringPart, int newStartIndex, int newLength) => + new StringPart(stringPart.Target, newStartIndex, newLength); + public static StringPart Shrink(this StringPart stringPart, int newStartIndex) => + stringPart.Shrink(newStartIndex, stringPart.EndIndex - newStartIndex); + + public static StringPart Shrink(this StringPart stringPart, Capture capture) => + stringPart.Shrink(capture.Index, capture.Length); + + public static IEnumerable> MatchAll(this IMatcher matcher, StringPart stringPart, + Func fallbackTransform) + { // Loop through segments divided by individual matches - var currentIndex = startIndex; - while (currentIndex < endIndex) + var currentIndex = stringPart.StartIndex; + while (currentIndex < stringPart.EndIndex) { // Find a match within this segment - var match = matcher.Match(input, currentIndex, endIndex - currentIndex); + var match = matcher.Match(stringPart.Shrink(currentIndex, stringPart.EndIndex - currentIndex)); // If there's no match - break if (match == null) break; // If this match doesn't start immediately at current index - transform and yield fallback first - if (match.StartIndex > currentIndex) + if (match.StringPart.StartIndex > currentIndex) { - var fallback = input.Substring(currentIndex, match.StartIndex - currentIndex); - yield return new ParsedMatch(currentIndex, fallback.Length, fallbackTransform(fallback)); + var fallbackPart = stringPart.Shrink(currentIndex, match.StringPart.StartIndex - currentIndex); + yield return new ParsedMatch(fallbackPart, fallbackTransform(fallbackPart)); } // Yield match yield return match; // Shift current index to the end of the match - currentIndex = match.StartIndex + match.Length; + currentIndex = match.StringPart.StartIndex + match.StringPart.Length; } // If EOL wasn't reached - transform and yield remaining part as fallback - if (currentIndex < endIndex) + if (currentIndex < stringPart.EndIndex) { - var fallback = input.Substring(currentIndex); - yield return new ParsedMatch(currentIndex, fallback.Length, fallbackTransform(fallback)); + var fallbackPart = stringPart.Shrink(currentIndex); + yield return new ParsedMatch(fallbackPart, fallbackTransform(fallbackPart)); } } - - public static IEnumerable> MatchAll(this IMatcher matcher, string input, - Func fallbackTransform) => matcher.MatchAll(input, 0, input.Length, fallbackTransform); } } \ No newline at end of file diff --git a/DiscordChatExporter.Core.Markdown/Internal/IMatcher.cs b/DiscordChatExporter.Core.Markdown/Internal/IMatcher.cs index fa0c7754..b6c89050 100644 --- a/DiscordChatExporter.Core.Markdown/Internal/IMatcher.cs +++ b/DiscordChatExporter.Core.Markdown/Internal/IMatcher.cs @@ -2,6 +2,6 @@ { internal interface IMatcher { - ParsedMatch Match(string input, int startIndex, int length); + ParsedMatch Match(StringPart stringPart); } } \ No newline at end of file diff --git a/DiscordChatExporter.Core.Markdown/Internal/ParsedMatch.cs b/DiscordChatExporter.Core.Markdown/Internal/ParsedMatch.cs index b56d2eab..8ebc32ea 100644 --- a/DiscordChatExporter.Core.Markdown/Internal/ParsedMatch.cs +++ b/DiscordChatExporter.Core.Markdown/Internal/ParsedMatch.cs @@ -2,16 +2,13 @@ { internal class ParsedMatch { - public int StartIndex { get; } - - public int Length { get; } + public StringPart StringPart { get; } public T Value { get; } - public ParsedMatch(int startIndex, int length, T value) + public ParsedMatch(StringPart stringPart, T value) { - StartIndex = startIndex; - Length = length; + StringPart = stringPart; Value = value; } } diff --git a/DiscordChatExporter.Core.Markdown/Internal/RegexMatcher.cs b/DiscordChatExporter.Core.Markdown/Internal/RegexMatcher.cs index 9977410c..ee7cf32f 100644 --- a/DiscordChatExporter.Core.Markdown/Internal/RegexMatcher.cs +++ b/DiscordChatExporter.Core.Markdown/Internal/RegexMatcher.cs @@ -1,4 +1,8 @@ using System; +using System.Collections.Generic; +using System.Linq; +using System.Linq.Expressions; +using System.Reflection; using System.Text.RegularExpressions; namespace DiscordChatExporter.Core.Markdown.Internal @@ -6,18 +10,35 @@ namespace DiscordChatExporter.Core.Markdown.Internal internal class RegexMatcher : IMatcher { private readonly Regex _regex; - private readonly Func _transform; + private readonly Func _transform; - public RegexMatcher(Regex regex, Func transform) + public RegexMatcher(Regex regex, Func transform) { _regex = regex; _transform = transform; } - public ParsedMatch Match(string input, int startIndex, int length) + public RegexMatcher(Regex regex, Func transform) + : this(regex, (p, m) => transform(m)) { - var match = _regex.Match(input, startIndex, length); - return match.Success ? new ParsedMatch(match.Index, match.Length, _transform(match)) : null; + } + + public ParsedMatch Match(StringPart stringPart) + { + var match = _regex.Match(stringPart.Target, stringPart.StartIndex, stringPart.Length); + if (!match.Success) + return null; + + // Overload regex.Match(string, int, int) doesn't take the whole string into account, + // it effectively functions as a match check on a substring. + // Which is super weird because regex.Match(string, int) takes the whole input in context. + // So in order to properly account for ^/$ regex tokens, we need to make sure that + // the expression also matches on the bigger part of the input. + if (!_regex.IsMatch(stringPart.Target.Substring(0, stringPart.EndIndex), stringPart.StartIndex)) + return null; + + var stringPartShrunk = stringPart.Shrink(match.Index, match.Length); + return new ParsedMatch(stringPartShrunk, _transform(stringPartShrunk, match)); } } } \ No newline at end of file diff --git a/DiscordChatExporter.Core.Markdown/Internal/StringMatcher.cs b/DiscordChatExporter.Core.Markdown/Internal/StringMatcher.cs index e757d6b4..fbf42901 100644 --- a/DiscordChatExporter.Core.Markdown/Internal/StringMatcher.cs +++ b/DiscordChatExporter.Core.Markdown/Internal/StringMatcher.cs @@ -6,24 +6,31 @@ namespace DiscordChatExporter.Core.Markdown.Internal { private readonly string _needle; private readonly StringComparison _comparison; - private readonly Func _transform; + private readonly Func _transform; - public StringMatcher(string needle, StringComparison comparison, Func transform) + public StringMatcher(string needle, StringComparison comparison, Func transform) { _needle = needle; _comparison = comparison; _transform = transform; } - public StringMatcher(string needle, Func transform) + public StringMatcher(string needle, Func transform) : this(needle, StringComparison.Ordinal, transform) { } - public ParsedMatch Match(string input, int startIndex, int length) + public ParsedMatch Match(StringPart stringPart) { - var index = input.IndexOf(_needle, startIndex, length, _comparison); - return index >= 0 ? new ParsedMatch(index, _needle.Length, _transform(_needle)) : null; + var index = stringPart.Target.IndexOf(_needle, stringPart.StartIndex, stringPart.Length, _comparison); + + if (index >= 0) + { + var stringPartShrunk = stringPart.Shrink(index, _needle.Length); + return new ParsedMatch(stringPartShrunk, _transform(stringPartShrunk)); + } + + return null; } } } \ No newline at end of file diff --git a/DiscordChatExporter.Core.Markdown/Internal/StringPart.cs b/DiscordChatExporter.Core.Markdown/Internal/StringPart.cs new file mode 100644 index 00000000..7a1b55f7 --- /dev/null +++ b/DiscordChatExporter.Core.Markdown/Internal/StringPart.cs @@ -0,0 +1,28 @@ +namespace DiscordChatExporter.Core.Markdown.Internal +{ + internal class StringPart + { + public string Target { get; } + + public int StartIndex { get; } + + public int Length { get; } + + public int EndIndex { get; } + + public StringPart(string target, int startIndex, int length) + { + Target = target; + StartIndex = startIndex; + Length = length; + EndIndex = startIndex + length; + } + + public StringPart(string target) + : this(target, 0, target.Length) + { + } + + public override string ToString() => Target.Substring(StartIndex, Length); + } +} \ No newline at end of file diff --git a/DiscordChatExporter.Core.Markdown/MarkdownParser.cs b/DiscordChatExporter.Core.Markdown/MarkdownParser.cs index 5419da46..f49c55a1 100644 --- a/DiscordChatExporter.Core.Markdown/MarkdownParser.cs +++ b/DiscordChatExporter.Core.Markdown/MarkdownParser.cs @@ -1,4 +1,4 @@ -using System.Collections.Generic; +using System.Collections.Generic; using System.Linq; using System.Text.RegularExpressions; using DiscordChatExporter.Core.Markdown.Internal; @@ -10,94 +10,94 @@ namespace DiscordChatExporter.Core.Markdown // The following parsing logic is meant to replicate Discord's markdown grammar as close as possible public static class MarkdownParser { - private const RegexOptions DefaultRegexOptions = RegexOptions.Compiled | RegexOptions.CultureInvariant; + private const RegexOptions DefaultRegexOptions = RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.Multiline; /* Formatting */ // Capture any character until the earliest double asterisk not followed by an asterisk private static readonly IMatcher BoldFormattedNodeMatcher = new RegexMatcher( new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline), - m => new FormattedNode(m.Value, "**", TextFormatting.Bold, Parse(m.Groups[1].Value))); + (p, m) => new FormattedNode(TextFormatting.Bold, Parse(p.Shrink(m.Groups[1])))); // Capture any character until the earliest single asterisk not preceded or followed by an asterisk // Opening asterisk must not be followed by whitespace // Closing asterisk must not be preceded by whitespace private static readonly IMatcher ItalicFormattedNodeMatcher = new RegexMatcher( new Regex("\\*(?!\\s)(.+?)(? new FormattedNode(m.Value, "*", TextFormatting.Italic, Parse(m.Groups[1].Value))); + (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Shrink(m.Groups[1])))); // Capture any character until the earliest triple asterisk not followed by an asterisk private static readonly IMatcher ItalicBoldFormattedNodeMatcher = new RegexMatcher( new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline), - m => new FormattedNode(m.Value, "*", TextFormatting.Italic, Parse(m.Groups[1].Value, BoldFormattedNodeMatcher))); + (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Shrink(m.Groups[1]), BoldFormattedNodeMatcher))); // Capture any character except underscore until an underscore // Closing underscore must not be followed by a word character private static readonly IMatcher ItalicAltFormattedNodeMatcher = new RegexMatcher( new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline), - m => new FormattedNode(m.Value, "_", TextFormatting.Italic, Parse(m.Groups[1].Value))); + (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Shrink(m.Groups[1])))); // Capture any character until the earliest double underscore not followed by an underscore private static readonly IMatcher UnderlineFormattedNodeMatcher = new RegexMatcher( new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline), - m => new FormattedNode(m.Value, "__", TextFormatting.Underline, Parse(m.Groups[1].Value))); + (p, m) => new FormattedNode(TextFormatting.Underline, Parse(p.Shrink(m.Groups[1])))); // Capture any character until the earliest triple underscore not followed by an underscore private static readonly IMatcher ItalicUnderlineFormattedNodeMatcher = new RegexMatcher( new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline), - m => new FormattedNode(m.Value, "_", TextFormatting.Italic, Parse(m.Groups[1].Value, UnderlineFormattedNodeMatcher))); + (p, m) => new FormattedNode(TextFormatting.Italic, Parse(p.Shrink(m.Groups[1]), UnderlineFormattedNodeMatcher))); // Capture any character until the earliest double tilde private static readonly IMatcher StrikethroughFormattedNodeMatcher = new RegexMatcher( new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline), - m => new FormattedNode(m.Value, "~~", TextFormatting.Strikethrough, Parse(m.Groups[1].Value))); + (p, m) => new FormattedNode(TextFormatting.Strikethrough, Parse(p.Shrink(m.Groups[1])))); // Capture any character until the earliest double pipe private static readonly IMatcher SpoilerFormattedNodeMatcher = new RegexMatcher( new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline), - m => new FormattedNode(m.Value, "||", TextFormatting.Spoiler, Parse(m.Groups[1].Value))); + (p, m) => new FormattedNode(TextFormatting.Spoiler, Parse(p.Shrink(m.Groups[1])))); /* Code blocks */ // Capture any character except backtick until a backtick - // Whitespace surrounding content inside backticks is trimmed + // Blank lines at the beginning and end of content are trimmed private static readonly IMatcher InlineCodeBlockNodeMatcher = new RegexMatcher( new Regex("`([^`]+)`", DefaultRegexOptions | RegexOptions.Singleline), - m => new InlineCodeBlockNode(m.Value, m.Groups[1].Value.Trim())); + m => new InlineCodeBlockNode(m.Groups[1].Value.Trim('\r', '\n'))); // Capture language identifier and then any character until the earliest triple backtick - // Languge identifier is one word immediately after opening backticks, followed immediately by newline - // Whitespace surrounding content inside backticks is trimmed - private static readonly IMatcher MultilineCodeBlockNodeMatcher = new RegexMatcher( + // Language identifier is one word immediately after opening backticks, followed immediately by newline + // Blank lines at the beginning and end of content are trimmed + private static readonly IMatcher MultiLineCodeBlockNodeMatcher = new RegexMatcher( new Regex("```(?:(\\w*)\\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline), - m => new MultilineCodeBlockNode(m.Value, m.Groups[1].Value, m.Groups[2].Value.Trim())); + m => new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n'))); /* Mentions */ // Capture @everyone private static readonly IMatcher EveryoneMentionNodeMatcher = new StringMatcher( "@everyone", - s => new MentionNode(s, "everyone", MentionType.Meta)); + p => new MentionNode("everyone", MentionType.Meta)); // Capture @here private static readonly IMatcher HereMentionNodeMatcher = new StringMatcher( "@here", - s => new MentionNode(s, "here", MentionType.Meta)); + p => new MentionNode("here", MentionType.Meta)); // Capture <@123456> or <@!123456> private static readonly IMatcher UserMentionNodeMatcher = new RegexMatcher( new Regex("<@!?(\\d+)>", DefaultRegexOptions), - m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.User)); + m => new MentionNode(m.Groups[1].Value, MentionType.User)); // Capture <#123456> private static readonly IMatcher ChannelMentionNodeMatcher = new RegexMatcher( new Regex("<#(\\d+)>", DefaultRegexOptions), - m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Channel)); + m => new MentionNode(m.Groups[1].Value, MentionType.Channel)); // Capture <@&123456> private static readonly IMatcher RoleMentionNodeMatcher = new RegexMatcher( new Regex("<@&(\\d+)>", DefaultRegexOptions), - m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Role)); + m => new MentionNode(m.Groups[1].Value, MentionType.Role)); /* Emojis */ @@ -108,29 +108,29 @@ namespace DiscordChatExporter.Core.Markdown // (this does not match all emojis in Discord but it's reasonably accurate enough) private static readonly IMatcher StandardEmojiNodeMatcher = new RegexMatcher( new Regex("((?:[\\uD83C][\\uDDE6-\\uDDFF]){2}|\\p{So}|\\p{Cs}{2}|\\d\\p{Me})", DefaultRegexOptions), - m => new EmojiNode(m.Value, m.Groups[1].Value)); + m => new EmojiNode(m.Groups[1].Value)); // Capture <:lul:123456> or private static readonly IMatcher CustomEmojiNodeMatcher = new RegexMatcher( new Regex("<(a)?:(.+?):(\\d+?)>", DefaultRegexOptions), - m => new EmojiNode(m.Value, m.Groups[3].Value, m.Groups[2].Value, !m.Groups[1].Value.IsNullOrWhiteSpace())); + m => new EmojiNode(m.Groups[3].Value, m.Groups[2].Value, !m.Groups[1].Value.IsNullOrWhiteSpace())); /* Links */ // Capture [title](link) private static readonly IMatcher TitledLinkNodeMatcher = new RegexMatcher( new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions), - m => new LinkNode(m.Value, m.Groups[2].Value, m.Groups[1].Value)); + m => new LinkNode(m.Groups[2].Value, m.Groups[1].Value)); // Capture any non-whitespace character after http:// or https:// until the last punctuation character or whitespace private static readonly IMatcher AutoLinkNodeMatcher = new RegexMatcher( new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions), - m => new LinkNode(m.Value, m.Groups[1].Value)); + m => new LinkNode(m.Groups[1].Value)); // Same as auto link but also surrounded by angular brackets private static readonly IMatcher HiddenLinkNodeMatcher = new RegexMatcher( new Regex("<(https?://\\S*[^\\.,:;\"\'\\s])>", DefaultRegexOptions), - m => new LinkNode(m.Value, m.Groups[1].Value)); + m => new LinkNode(m.Groups[1].Value)); /* Text */ @@ -138,25 +138,25 @@ namespace DiscordChatExporter.Core.Markdown // This escapes it from matching for formatting private static readonly IMatcher ShrugTextNodeMatcher = new StringMatcher( @"¯\_(ツ)_/¯", - s => new TextNode(s)); + p => new TextNode(p.ToString())); // Capture some specific emojis that don't get rendered // This escapes it from matching for emoji private static readonly IMatcher IgnoredEmojiTextNodeMatcher = new RegexMatcher( new Regex("(\\u26A7|\\u2640|\\u2642|\\u2695|\\u267E|\\u00A9|\\u00AE|\\u2122)", DefaultRegexOptions), - m => new TextNode(m.Value, m.Groups[1].Value)); + m => new TextNode(m.Groups[1].Value)); // Capture any "symbol/other" character or surrogate pair preceded by a backslash // This escapes it from matching for emoji private static readonly IMatcher EscapedSymbolTextNodeMatcher = new RegexMatcher( new Regex("\\\\(\\p{So}|\\p{Cs}{2})", DefaultRegexOptions), - m => new TextNode(m.Value, m.Groups[1].Value)); + m => new TextNode(m.Groups[1].Value)); // Capture any non-whitespace, non latin alphanumeric character preceded by a backslash // This escapes it from matching for formatting or other tokens private static readonly IMatcher EscapedCharacterTextNodeMatcher = new RegexMatcher( new Regex("\\\\([^a-zA-Z0-9\\s])", DefaultRegexOptions), - m => new TextNode(m.Value, m.Groups[1].Value)); + m => new TextNode(m.Groups[1].Value)); // Combine all matchers into one // Matchers that have similar patterns are ordered from most specific to least specific @@ -178,7 +178,7 @@ namespace DiscordChatExporter.Core.Markdown SpoilerFormattedNodeMatcher, // Code blocks - MultilineCodeBlockNodeMatcher, + MultiLineCodeBlockNodeMatcher, InlineCodeBlockNodeMatcher, // Mentions @@ -197,9 +197,27 @@ namespace DiscordChatExporter.Core.Markdown StandardEmojiNodeMatcher, CustomEmojiNodeMatcher); - private static IReadOnlyList Parse(string input, IMatcher matcher) => - matcher.MatchAll(input, s => new TextNode(s)).Select(r => r.Value).ToArray(); + private static readonly IMatcher MinimalAggregateNodeMatcher = new AggregateMatcher( + // Mentions + EveryoneMentionNodeMatcher, + HereMentionNodeMatcher, + UserMentionNodeMatcher, + ChannelMentionNodeMatcher, + RoleMentionNodeMatcher, - public static IReadOnlyList Parse(string input) => Parse(input, AggregateNodeMatcher); + // Emoji + StandardEmojiNodeMatcher, + CustomEmojiNodeMatcher); + + private static IReadOnlyList Parse(StringPart stringPart, IMatcher matcher) => + matcher.MatchAll(stringPart, p => new TextNode(p.ToString())).Select(r => r.Value).ToArray(); + + private static IReadOnlyList Parse(StringPart stringPart) => Parse(stringPart, AggregateNodeMatcher); + + private static IReadOnlyList ParseMinimal(StringPart stringPart) => Parse(stringPart, MinimalAggregateNodeMatcher); + + public static IReadOnlyList Parse(string input) => Parse(new StringPart(input)); + + public static IReadOnlyList ParseMinimal(string input) => ParseMinimal(new StringPart(input)); } } \ No newline at end of file diff --git a/DiscordChatExporter.Core.Markdown/Nodes/EmojiNode.cs b/DiscordChatExporter.Core.Markdown/Nodes/EmojiNode.cs index 86cef1d5..798411ba 100644 --- a/DiscordChatExporter.Core.Markdown/Nodes/EmojiNode.cs +++ b/DiscordChatExporter.Core.Markdown/Nodes/EmojiNode.cs @@ -12,16 +12,15 @@ namespace DiscordChatExporter.Core.Markdown.Nodes public bool IsCustomEmoji => !Id.IsNullOrWhiteSpace(); - public EmojiNode(string source, string id, string name, bool isAnimated) - : base(source) + public EmojiNode(string id, string name, bool isAnimated) { Id = id; Name = name; IsAnimated = isAnimated; } - public EmojiNode(string source, string name) - : this(source, null, name, false) + public EmojiNode(string name) + : this(null, name, false) { } diff --git a/DiscordChatExporter.Core.Markdown/Nodes/FormattedNode.cs b/DiscordChatExporter.Core.Markdown/Nodes/FormattedNode.cs index 30808da1..ee540eb6 100644 --- a/DiscordChatExporter.Core.Markdown/Nodes/FormattedNode.cs +++ b/DiscordChatExporter.Core.Markdown/Nodes/FormattedNode.cs @@ -4,16 +4,12 @@ namespace DiscordChatExporter.Core.Markdown.Nodes { public class FormattedNode : Node { - public string Token { get; } - public TextFormatting Formatting { get; } public IReadOnlyList Children { get; } - public FormattedNode(string source, string token, TextFormatting formatting, IReadOnlyList children) - : base(source) + public FormattedNode(TextFormatting formatting, IReadOnlyList children) { - Token = token; Formatting = formatting; Children = children; } diff --git a/DiscordChatExporter.Core.Markdown/Nodes/InlineCodeBlockNode.cs b/DiscordChatExporter.Core.Markdown/Nodes/InlineCodeBlockNode.cs index e5b85d99..6cb03917 100644 --- a/DiscordChatExporter.Core.Markdown/Nodes/InlineCodeBlockNode.cs +++ b/DiscordChatExporter.Core.Markdown/Nodes/InlineCodeBlockNode.cs @@ -4,8 +4,7 @@ { public string Code { get; } - public InlineCodeBlockNode(string source, string code) - : base(source) + public InlineCodeBlockNode(string code) { Code = code; } diff --git a/DiscordChatExporter.Core.Markdown/Nodes/LinkNode.cs b/DiscordChatExporter.Core.Markdown/Nodes/LinkNode.cs index 9c2e1770..5b5b6e6d 100644 --- a/DiscordChatExporter.Core.Markdown/Nodes/LinkNode.cs +++ b/DiscordChatExporter.Core.Markdown/Nodes/LinkNode.cs @@ -6,14 +6,14 @@ public string Title { get; } - public LinkNode(string source, string url, string title) - : base(source) + public LinkNode(string url, string title) { Url = url; Title = title; } - public LinkNode(string source, string url) : this(source, url, url) + public LinkNode(string url) + : this(url, url) { } diff --git a/DiscordChatExporter.Core.Markdown/Nodes/MentionNode.cs b/DiscordChatExporter.Core.Markdown/Nodes/MentionNode.cs index 3a054425..f1b3a794 100644 --- a/DiscordChatExporter.Core.Markdown/Nodes/MentionNode.cs +++ b/DiscordChatExporter.Core.Markdown/Nodes/MentionNode.cs @@ -6,8 +6,7 @@ public MentionType Type { get; } - public MentionNode(string source, string id, MentionType type) - : base(source) + public MentionNode(string id, MentionType type) { Id = id; Type = type; diff --git a/DiscordChatExporter.Core.Markdown/Nodes/MultilineCodeBlockNode.cs b/DiscordChatExporter.Core.Markdown/Nodes/MultiLineCodeBlockNode.cs similarity index 65% rename from DiscordChatExporter.Core.Markdown/Nodes/MultilineCodeBlockNode.cs rename to DiscordChatExporter.Core.Markdown/Nodes/MultiLineCodeBlockNode.cs index 3f647bef..a69f4622 100644 --- a/DiscordChatExporter.Core.Markdown/Nodes/MultilineCodeBlockNode.cs +++ b/DiscordChatExporter.Core.Markdown/Nodes/MultiLineCodeBlockNode.cs @@ -1,13 +1,12 @@ namespace DiscordChatExporter.Core.Markdown.Nodes { - public class MultilineCodeBlockNode : Node + public class MultiLineCodeBlockNode : Node { public string Language { get; } public string Code { get; } - public MultilineCodeBlockNode(string source, string language, string code) - : base(source) + public MultiLineCodeBlockNode(string language, string code) { Language = language; Code = code; diff --git a/DiscordChatExporter.Core.Markdown/Nodes/Node.cs b/DiscordChatExporter.Core.Markdown/Nodes/Node.cs index 22f6462a..44e3a997 100644 --- a/DiscordChatExporter.Core.Markdown/Nodes/Node.cs +++ b/DiscordChatExporter.Core.Markdown/Nodes/Node.cs @@ -2,11 +2,5 @@ { public abstract class Node { - public string Source { get; } - - protected Node(string source) - { - Source = source; - } } } \ No newline at end of file diff --git a/DiscordChatExporter.Core.Markdown/Nodes/TextNode.cs b/DiscordChatExporter.Core.Markdown/Nodes/TextNode.cs index 8bc77778..a411cee4 100644 --- a/DiscordChatExporter.Core.Markdown/Nodes/TextNode.cs +++ b/DiscordChatExporter.Core.Markdown/Nodes/TextNode.cs @@ -4,16 +4,11 @@ { public string Text { get; } - public TextNode(string source, string text) - : base(source) + public TextNode(string text) { Text = text; } - public TextNode(string text) : this(text, text) - { - } - public override string ToString() => Text; } } \ No newline at end of file diff --git a/DiscordChatExporter.Core.Rendering/CsvChatLogRenderer.cs b/DiscordChatExporter.Core.Rendering/CsvChatLogRenderer.cs index 25cd099a..b79f7f75 100644 --- a/DiscordChatExporter.Core.Rendering/CsvChatLogRenderer.cs +++ b/DiscordChatExporter.Core.Rendering/CsvChatLogRenderer.cs @@ -27,18 +27,21 @@ namespace DiscordChatExporter.Core.Rendering private string FormatMarkdown(Node node) { - // Formatted node - if (node is FormattedNode formattedNode) + // Text node + if (node is TextNode textNode) { - // Recursively get inner text - var innerText = FormatMarkdown(formattedNode.Children); - - return $"{formattedNode.Token}{innerText}{formattedNode.Token}"; + return textNode.Text; } - // Non-meta mention node - if (node is MentionNode mentionNode && mentionNode.Type != MentionType.Meta) + // Mention node + if (node is MentionNode mentionNode) { + // Meta mention node + if (mentionNode.Type == MentionType.Meta) + { + return mentionNode.Id; + } + // User mention node if (mentionNode.Type == MentionType.User) { @@ -61,19 +64,19 @@ namespace DiscordChatExporter.Core.Rendering } } - // Custom emoji node - if (node is EmojiNode emojiNode && emojiNode.IsCustomEmoji) + // Emoji node + if (node is EmojiNode emojiNode) { - return $":{emojiNode.Name}:"; + return emojiNode.IsCustomEmoji ? $":{emojiNode.Name}:" : emojiNode.Name; } - // All other nodes - simply return source - return node.Source; + // Throw on unexpected nodes + throw new InvalidOperationException($"Unexpected node: [{node.GetType()}]."); } private string FormatMarkdown(IEnumerable nodes) => nodes.Select(FormatMarkdown).JoinToString(""); - private string FormatMarkdown(string markdown) => FormatMarkdown(MarkdownParser.Parse(markdown)); + private string FormatMarkdown(string markdown) => FormatMarkdown(MarkdownParser.ParseMinimal(markdown)); private async Task RenderFieldAsync(TextWriter writer, string value) { diff --git a/DiscordChatExporter.Core.Rendering/HtmlChatLogRenderer.cs b/DiscordChatExporter.Core.Rendering/HtmlChatLogRenderer.cs index 3d3028c5..433c26d0 100644 --- a/DiscordChatExporter.Core.Rendering/HtmlChatLogRenderer.cs +++ b/DiscordChatExporter.Core.Rendering/HtmlChatLogRenderer.cs @@ -90,7 +90,7 @@ namespace DiscordChatExporter.Core.Rendering } // Multi-line code block node - if (node is MultilineCodeBlockNode multilineCodeBlockNode) + if (node is MultiLineCodeBlockNode multilineCodeBlockNode) { // Set CSS class for syntax highlighting var highlightCssClass = !multilineCodeBlockNode.Language.IsNullOrWhiteSpace() @@ -154,14 +154,14 @@ namespace DiscordChatExporter.Core.Rendering : $"{HtmlEncode(linkNode.Title)}"; } - // All other nodes - simply return source - return node.Source; + // Throw on unexpected nodes + throw new InvalidOperationException($"Unexpected node: [{node.GetType()}]."); } private string FormatMarkdown(IReadOnlyList nodes, bool isTopLevel) { - // Emojis are jumbo if all top-level nodes are emoji nodes, disregarding whitespace - var isJumbo = isTopLevel && nodes.Where(n => !n.Source.IsNullOrWhiteSpace()).All(n => n is EmojiNode); + // Emojis are jumbo if all top-level nodes are emoji nodes or whitespace text nodes + var isJumbo = isTopLevel && nodes.All(n => n is EmojiNode || n is TextNode textNode && textNode.Text.IsNullOrWhiteSpace()); return nodes.Select(n => FormatMarkdown(n, isJumbo)).JoinToString(""); } diff --git a/DiscordChatExporter.Core.Rendering/PlainTextChatLogRenderer.cs b/DiscordChatExporter.Core.Rendering/PlainTextChatLogRenderer.cs index d17f8f76..5cc6bdbb 100644 --- a/DiscordChatExporter.Core.Rendering/PlainTextChatLogRenderer.cs +++ b/DiscordChatExporter.Core.Rendering/PlainTextChatLogRenderer.cs @@ -45,18 +45,21 @@ namespace DiscordChatExporter.Core.Rendering private string FormatMarkdown(Node node) { - // Formatted node - if (node is FormattedNode formattedNode) + // Text node + if (node is TextNode textNode) { - // Recursively get inner text - var innerText = FormatMarkdown(formattedNode.Children); - - return $"{formattedNode.Token}{innerText}{formattedNode.Token}"; + return textNode.Text; } - // Non-meta mention node - if (node is MentionNode mentionNode && mentionNode.Type != MentionType.Meta) + // Mention node + if (node is MentionNode mentionNode) { + // Meta mention node + if (mentionNode.Type == MentionType.Meta) + { + return mentionNode.Id; + } + // User mention node if (mentionNode.Type == MentionType.User) { @@ -79,19 +82,19 @@ namespace DiscordChatExporter.Core.Rendering } } - // Custom emoji node - if (node is EmojiNode emojiNode && emojiNode.IsCustomEmoji) + // Emoji node + if (node is EmojiNode emojiNode) { - return $":{emojiNode.Name}:"; + return emojiNode.IsCustomEmoji ? $":{emojiNode.Name}:" : emojiNode.Name; } - // All other nodes - simply return source - return node.Source; + // Throw on unexpected nodes + throw new InvalidOperationException($"Unexpected node: [{node.GetType()}]."); } private string FormatMarkdown(IEnumerable nodes) => nodes.Select(FormatMarkdown).JoinToString(""); - private string FormatMarkdown(string markdown) => FormatMarkdown(MarkdownParser.Parse(markdown)); + private string FormatMarkdown(string markdown) => FormatMarkdown(MarkdownParser.ParseMinimal(markdown)); private async Task RenderAttachmentsAsync(TextWriter writer, IReadOnlyList attachments) { diff --git a/DiscordChatExporter.Core.Rendering/Resources/HtmlShared.css b/DiscordChatExporter.Core.Rendering/Resources/HtmlShared.css index 890b74c0..d0760aa7 100644 --- a/DiscordChatExporter.Core.Rendering/Resources/HtmlShared.css +++ b/DiscordChatExporter.Core.Rendering/Resources/HtmlShared.css @@ -58,7 +58,7 @@ img { } .pre { - font-family: "Consolas", "Courier New", Courier, Monospace; + font-family: "Consolas", "Courier New", Courier, monospace; } .pre--multiline {