Improve performance (#162)

2025-06-01 07:18:23 -04:00 · 2019-04-10 23:45:21 +03:00 · 2019-04-10 23:45:21 +03:00 · 4bfb2ec7fd
commit 4bfb2ec7fd
parent 359278afec
86 changed files with 1242 additions and 900 deletions
--- a/DiscordChatExporter.Core.Markdown/DiscordChatExporter.Core.Markdown.csproj
+++ b/DiscordChatExporter.Core.Markdown/DiscordChatExporter.Core.Markdown.csproj
@ -5,8 +5,7 @@
  </PropertyGroup>

  <ItemGroup>
-    <PackageReference Include="Sprache" Version="2.2.0" />
-    <PackageReference Include="Tyrrrz.Extensions" Version="1.5.1" />
+    <PackageReference Include="Tyrrrz.Extensions" Version="1.6.0" />
  </ItemGroup>

-</Project>
+</Project>
--- a/DiscordChatExporter.Core.Markdown/Internal/AggregateMatcher.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/AggregateMatcher.cs
@ -0,0 +1,46 @@
+using System.Collections.Generic;
+
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    // Matcher that delegates to a list of other matchers and reports whichever match starts first
+    internal class AggregateMatcher<T> : IMatcher<T>
+    {
+        private readonly IReadOnlyList<IMatcher<T>> _matchers;
+
+        // The order of matchers matters: when two matches start at the same index, the matcher listed first wins
+        public AggregateMatcher(IReadOnlyList<IMatcher<T>> matchers)
+        {
+            _matchers = matchers;
+        }
+
+        // Convenience overload that allows listing the matchers inline
+        public AggregateMatcher(params IMatcher<T>[] matchers)
+            : this((IReadOnlyList<IMatcher<T>>)matchers)
+        {
+        }
+
+        // Runs the inner matchers against the same segment of the input
+        // Returns the match with the lowest start index, or null if no matcher found anything
+        public ParsedMatch<T> Match(string input, int startIndex, int length)
+        {
+            ParsedMatch<T> best = null;
+
+            // Consult every matcher in the order they were supplied
+            for (var i = 0; i < _matchers.Count; i++)
+            {
+                var candidate = _matchers[i].Match(input, startIndex, length);
+
+                // Take the candidate if it's the first hit or if it starts strictly earlier than the best so far
+                // (strict comparison keeps the earlier-listed matcher on ties)
+                if (candidate != null && (best == null || candidate.StartIndex < best.StartIndex))
+                    best = candidate;
+
+                // No match can start before the beginning of the segment, so the search can stop here
+                if (best != null && best.StartIndex == startIndex)
+                    break;
+            }
+
+            return best;
+        }
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Internal/Extensions.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/Extensions.cs
@ -0,0 +1,50 @@
+using System;
+using System.Collections.Generic;
+
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    internal static class Extensions
+    {
+        // Lazily splits the segment [startIndex, startIndex + length) of the input into consecutive matches.
+        // Text not claimed by the matcher is passed through fallbackTransform and yielded as a match too,
+        // so the yielded matches cover the whole segment in order, without gaps.
+        public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, string input,
+            int startIndex, int length, Func<string, T> fallbackTransform)
+        {
+            var endIndex = startIndex + length;
+
+            // NOTE(review): a matcher that returns a zero-length match at currentIndex would never advance this loop
+            var currentIndex = startIndex;
+            while (currentIndex < endIndex)
+            {
+                // Find the next match within the remaining part of the segment; stop when there is none
+                var match = matcher.Match(input, currentIndex, endIndex - currentIndex);
+                if (match == null)
+                    break;
+
+                // If this match doesn't start immediately at current index - transform and yield fallback first
+                if (match.StartIndex > currentIndex)
+                {
+                    var fallback = input.Substring(currentIndex, match.StartIndex - currentIndex);
+                    yield return new ParsedMatch<T>(currentIndex, fallback.Length, fallbackTransform(fallback));
+                }
+
+                yield return match;
+
+                // Shift current index to the end of the match
+                currentIndex = match.StartIndex + match.Length;
+            }
+
+            // If the end of the segment wasn't reached - transform and yield the remaining part as fallback.
+            // The remainder is bounded by endIndex so that text past the requested segment is never included.
+            if (currentIndex < endIndex)
+            {
+                var fallback = input.Substring(currentIndex, endIndex - currentIndex);
+                yield return new ParsedMatch<T>(currentIndex, fallback.Length, fallbackTransform(fallback));
+            }
+        }
+
+        // Same as above, applied to the whole input
+        public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, string input,
+            Func<string, T> fallbackTransform) => matcher.MatchAll(input, 0, input.Length, fallbackTransform);
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Internal/Grammar.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/Grammar.cs
@ -1,178 +0,0 @@
-using System.Collections.Generic;
-using System.Linq;
-using System.Text.RegularExpressions;
-using Sprache;
-using Tyrrrz.Extensions;
-
-namespace DiscordChatExporter.Core.Markdown.Internal
-{
-    // The following parsing logic is meant to replicate Discord's markdown grammar as close as possible
-    internal static class Grammar
-    {
-        /* Formatting */
-
-        // Capture until the earliest double asterisk not followed by an asterisk
-        private static readonly Parser<Node> BoldFormattedNode =
-            Parse.RegexMatch(new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "**", TextFormatting.Bold, BuildTree(m.Groups[1].Value)));
-
-        // Capture until the earliest single asterisk not preceded or followed by an asterisk
-        // Can't have whitespace right after opening or right before closing asterisk
-        private static readonly Parser<Node> ItalicFormattedNode =
-            Parse.RegexMatch(new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "*", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
-
-        // Can't have underscores inside
-        // Can't have word characters right after closing underscore
-        private static readonly Parser<Node> ItalicAltFormattedNode =
-            Parse.RegexMatch(new Regex("_([^_]+?)_(?!\\w)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "_", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
-
-        // Treated as a separate entity for simplicity
-        // Capture until the earliest triple asterisk not preceded or followed by an asterisk
-        private static readonly Parser<Node> ItalicBoldFormattedNode =
-            Parse.RegexMatch(new Regex("\\*(\\*\\*(?:.+?)\\*\\*)\\*(?!\\*)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "*", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
-
-        // Capture until the earliest double underscore not followed by an underscore
-        private static readonly Parser<Node> UnderlineFormattedNode =
-            Parse.RegexMatch(new Regex("__(.+?)__(?!_)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "__", TextFormatting.Underline, BuildTree(m.Groups[1].Value)));
-
-        // Treated as a separate entity for simplicity
-        // Capture until the earliest triple underscore not preceded or followed by an underscore
-        private static readonly Parser<Node> ItalicUnderlineFormattedNode =
-            Parse.RegexMatch(new Regex("_(__(?:.+?)__)_(?!_)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "_", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
-
-        // Strikethrough is safe
-        private static readonly Parser<Node> StrikethroughFormattedNode =
-            Parse.RegexMatch(new Regex("~~(.+?)~~", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "~~", TextFormatting.Strikethrough, BuildTree(m.Groups[1].Value)));
-
-        // Spoiler is safe
-        private static readonly Parser<Node> SpoilerFormattedNode =
-            Parse.RegexMatch(new Regex("\\|\\|(.+?)\\|\\|", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "||", TextFormatting.Spoiler, BuildTree(m.Groups[1].Value)));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyFormattedNode = 
-            ItalicBoldFormattedNode.Or(ItalicUnderlineFormattedNode)
-            .Or(BoldFormattedNode).Or(ItalicFormattedNode)
-            .Or(UnderlineFormattedNode).Or(ItalicAltFormattedNode)
-            .Or(StrikethroughFormattedNode).Or(SpoilerFormattedNode);
-
-        /* Code blocks */
-
-        // Can't have backticks inside and surrounding whitespace is trimmed
-        private static readonly Parser<Node> InlineCodeBlockNode =
-            Parse.RegexMatch(new Regex("`\\s*([^`]+?)\\s*`", RegexOptions.Singleline))
-                .Select(m => new InlineCodeBlockNode(m.Value, m.Groups[1].Value));
-
-        // The first word is a language identifier if it's the only word followed by a newline, the rest is code
-        private static readonly Parser<Node> MultilineCodeBlockNode =
-            Parse.RegexMatch(new Regex("```(?:(\\w*?)?(?:\\s*?\\n))?(.+?)```", RegexOptions.Singleline))
-                .Select(m => new MultilineCodeBlockNode(m.Value, m.Groups[1].Value, m.Groups[2].Value));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyCodeBlockNode = MultilineCodeBlockNode.Or(InlineCodeBlockNode);
-
-        /* Mentions */
-
-        // @everyone or @here
-        private static readonly Parser<Node> MetaMentionNode = Parse.RegexMatch("@(everyone|here)")
-            .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Meta));
-
-        // <@123456> or <@!123456>
-        private static readonly Parser<Node> UserMentionNode = Parse.RegexMatch("<@!?(\\d+)>")
-            .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.User));
-
-        // <#123456>
-        private static readonly Parser<Node> ChannelMentionNode = Parse.RegexMatch("<#(\\d+)>")
-            .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Channel));
-
-        // <@&123456>
-        private static readonly Parser<Node> RoleMentionNode = Parse.RegexMatch("<@&(\\d+)>")
-            .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Role));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyMentionNode =
-            MetaMentionNode.Or(UserMentionNode).Or(ChannelMentionNode).Or(RoleMentionNode);
-
-        /* Emojis */
-
-        // Matches all standard unicode emojis
-        private static readonly Parser<Node> StandardEmojiNode = Parse.RegexMatch(
-                "([\\u2700-\\u27bf]|" +
-                "(?:\\ud83c[\\udde6-\\uddff]){2}|" +
-                "[\\ud800-\\udbff][\\udc00-\\udfff]|" +
-                "[\\u0023-\\u0039]\\u20e3|" +
-                "\\u3299|\\u3297|\\u303d|\\u3030|\\u24c2|\\ud83c[\\udd70-\\udd71]|\\ud83c[\\udd7e-\\udd7f]|\\ud83c\\udd8e|\\ud83c[\\udd91-\\udd9a]|\\ud83c[\\udde6-\\uddff]|" +
-                "\\ud83c[\\ude01-\\ude02]|\\ud83c\\ude1a|\\ud83c\\ude2f|\\ud83c[\\ude32-\\ude3a]|\\ud83c[\\ude50-\\ude51]|\\u203c|\\u2049|[\\u25aa-\\u25ab]|" +
-                "\\u25b6|\\u25c0|[\\u25fb-\\u25fe]|\\u00a9|\\u00ae|\\u2122|\\u2139|\\ud83c\\udc04|[\\u2600-\\u26FF]|\\u2b05|\\u2b06|\\u2b07|\\u2b1b|\\u2b1c|\\u2b50|" +
-                "\\u2b55|\\u231a|\\u231b|\\u2328|\\u23cf|[\\u23e9-\\u23f3]|[\\u23f8-\\u23fa]|\\ud83c\\udccf|\\u2934|\\u2935|[\\u2190-\\u21ff])")
-            .Select(m => new EmojiNode(m.Value, m.Groups[1].Value));
-
-        // <:lul:123456> or <a:lul:123456>
-        private static readonly Parser<Node> CustomEmojiNode = Parse.RegexMatch("<(a)?:(.+?):(\\d+)>")
-            .Select(m => new EmojiNode(m.Value, m.Groups[3].Value, m.Groups[2].Value, m.Groups[1].Value.IsNotBlank()));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyEmojiNode = StandardEmojiNode.Or(CustomEmojiNode);
-
-        /* Links */
-
-        // [title](link)
-        private static readonly Parser<Node> TitledLinkNode = Parse.RegexMatch("\\[(.+?)\\]\\((.+?)\\)")
-            .Select(m => new LinkNode(m.Value, m.Groups[2].Value, m.Groups[1].Value));
-
-        // Starts with http:// or https://, stops at the last non-whitespace character followed by whitespace or punctuation character
-        private static readonly Parser<Node> AutoLinkNode = Parse.RegexMatch("(https?://\\S*[^\\.,:;\"\'\\s])")
-            .Select(m => new LinkNode(m.Value, m.Groups[1].Value));
-
-        // Autolink surrounded by angular brackets
-        private static readonly Parser<Node> HiddenLinkNode = Parse.RegexMatch("<(https?://\\S*[^\\.,:;\"\'\\s])>")
-            .Select(m => new LinkNode(m.Value, m.Groups[1].Value));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyLinkNode = TitledLinkNode.Or(HiddenLinkNode).Or(AutoLinkNode); 
-
-        /* Text */
-
-        // Shrug is an exception and needs to be exempt from formatting
-        private static readonly Parser<Node> ShrugTextNode =
-            Parse.String("¯\\_(ツ)_/¯").Text().Select(s => new TextNode(s));
-
-        // Backslash escapes any following unicode surrogate pair
-        private static readonly Parser<Node> EscapedSurrogateTextNode =
-            from slash in Parse.Char('\\')
-            from high in Parse.AnyChar.Where(char.IsHighSurrogate)
-            from low in Parse.AnyChar
-            let lexeme = $"{slash}{high}{low}"
-            let text = $"{high}{low}"
-            select new TextNode(lexeme, text);
-
-        // Backslash escapes any following non-whitespace character except for digits and latin letters
-        private static readonly Parser<Node> EscapedTextNode =
-            Parse.RegexMatch("\\\\([^a-zA-Z0-9\\s])").Select(m => new TextNode(m.Value, m.Groups[1].Value));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyTextNode = ShrugTextNode.Or(EscapedSurrogateTextNode).Or(EscapedTextNode);
-
-        /* Aggregator and fallback */
-
-        // Any node recognized by above patterns
-        private static readonly Parser<Node> AnyRecognizedNode = AnyFormattedNode.Or(AnyCodeBlockNode)
-            .Or(AnyMentionNode).Or(AnyEmojiNode).Or(AnyLinkNode).Or(AnyTextNode);
-
-        // Any node not recognized by above patterns (treated as plain text)
-        private static readonly Parser<Node> FallbackNode =
-            Parse.AnyChar.Except(AnyRecognizedNode).AtLeastOnce().Text().Select(s => new TextNode(s));
-
-        // Any node
-        private static readonly Parser<Node> AnyNode = AnyRecognizedNode.Or(FallbackNode);
-
-        // Entry point
-        public static IReadOnlyList<Node> BuildTree(string input) => AnyNode.Many().Parse(input).ToArray();
-    }
-}
--- a/DiscordChatExporter.Core.Markdown/Internal/IMatcher.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/IMatcher.cs
@ -0,0 +1,7 @@
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    // Finds a match within a segment of a string and produces a value of type T for it
+    internal interface IMatcher<T>
+    {
+        // Searches the part of the input that starts at startIndex and spans length characters
+        // The implementations in this project return null when nothing matches,
+        // and report the start index of a match relative to the whole input (not to startIndex)
+        ParsedMatch<T> Match(string input, int startIndex, int length);
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Internal/ParsedMatch.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/ParsedMatch.cs
@ -0,0 +1,18 @@
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    // Result produced by a matcher: a span of the input together with the value created for it
+    internal partial class ParsedMatch<T>
+    {
+        // Index in the input at which the matched span begins
+        public int StartIndex { get; }
+
+        // Number of characters in the matched span
+        public int Length { get; }
+
+        // Value created for the matched span
+        public T Value { get; }
+
+        // Stores the given values as-is, no validation is performed
+        public ParsedMatch(int startIndex, int length, T value)
+        {
+            StartIndex = startIndex;
+            Length = length;
+            Value = value;
+        }
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Internal/RegexMatcher.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/RegexMatcher.cs
@ -0,0 +1,23 @@
+using System;
+using System.Text.RegularExpressions;
+
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    // Matcher that locates text using a regular expression
+    internal class RegexMatcher<T> : IMatcher<T>
+    {
+        private readonly Regex _regex;
+        private readonly Func<Match, T> _transform;
+
+        // The transform converts a successful regex match into the resulting value
+        public RegexMatcher(Regex regex, Func<Match, T> transform)
+        {
+            _regex = regex;
+            _transform = transform;
+        }
+
+        // Returns the first regex match inside the given segment of the input, or null if there is none
+        // Per the .NET documentation, this Regex.Match overload searches as if only the segment existed,
+        // while the index of the match is still counted from the start of the whole input
+        public ParsedMatch<T> Match(string input, int startIndex, int length)
+        {
+            var result = _regex.Match(input, startIndex, length);
+            if (!result.Success)
+                return null;
+
+            return new ParsedMatch<T>(result.Index, result.Length, _transform(result));
+        }
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Internal/StringMatcher.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/StringMatcher.cs
@ -0,0 +1,29 @@
+using System;
+
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    // Matcher that locates the first occurrence of a fixed string (the needle)
+    internal class StringMatcher<T> : IMatcher<T>
+    {
+        private readonly string _needle;
+        private readonly StringComparison _comparison;
+        private readonly Func<string, T> _transform;
+
+        // The transform converts the matched text into the resulting value
+        public StringMatcher(string needle, StringComparison comparison, Func<string, T> transform)
+        {
+            _needle = needle;
+            _comparison = comparison;
+            _transform = transform;
+        }
+
+        // Uses ordinal (exact) comparison
+        public StringMatcher(string needle, Func<string, T> transform)
+            : this(needle, StringComparison.Ordinal, transform)
+        {
+        }
+
+        // Returns the first occurrence of the needle inside the given segment of the input, or null if there is none
+        // NOTE: the reported match length is the needle length, which is exact for ordinal comparisons;
+        // culture-sensitive comparisons may in rare cases match text of a different length
+        public ParsedMatch<T> Match(string input, int startIndex, int length)
+        {
+            var index = input.IndexOf(_needle, startIndex, length, _comparison);
+            if (index < 0)
+                return null;
+
+            // Hand the transform the text as it actually appears in the input.
+            // Under ordinal comparison that text always equals the needle, so the needle is reused to avoid an allocation.
+            // Under other comparisons (e.g. case-insensitive) the text in the input can differ from the needle;
+            // the length is clamped so that the substring never runs past the end of the input.
+            var matched = _comparison == StringComparison.Ordinal
+                ? _needle
+                : input.Substring(index, Math.Min(_needle.Length, input.Length - index));
+
+            return new ParsedMatch<T>(index, _needle.Length, _transform(matched));
+        }
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/MarkdownParser.cs
+++ b/DiscordChatExporter.Core.Markdown/MarkdownParser.cs
@ -1,10 +1,187 @@
 using System.Collections.Generic;
+using System.Linq;
+using System.Text.RegularExpressions;
 using DiscordChatExporter.Core.Markdown.Internal;
+using DiscordChatExporter.Core.Markdown.Nodes;
+using Tyrrrz.Extensions;

 namespace DiscordChatExporter.Core.Markdown
 {
+    // The following parsing logic is meant to replicate Discord's markdown grammar as close as possible
    public static class MarkdownParser
    {
-        public static IReadOnlyList<Node> Parse(string input) => Grammar.BuildTree(input);
+        // Options shared by every regex below
+        private const RegexOptions DefaultRegexOptions = RegexOptions.Compiled | RegexOptions.CultureInvariant;
+
+        /* Formatting */
+
+        // Capture any character until the earliest double asterisk not followed by an asterisk
+        private static readonly IMatcher<Node> BoldFormattedNodeMatcher = new RegexMatcher<Node>(
+            new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
+            m => new FormattedNode(m.Value, "**", TextFormatting.Bold, Parse(m.Groups[1].Value)));
+
+        // Capture any character until the earliest single asterisk not preceded or followed by an asterisk
+        // Opening asterisk must not be followed by whitespace
+        // Closing asterisk must not be preceded by whitespace
+        private static readonly IMatcher<Node> ItalicFormattedNodeMatcher = new RegexMatcher<Node>(
+            new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
+            m => new FormattedNode(m.Value, "*", TextFormatting.Italic, Parse(m.Groups[1].Value)));
+
+        // Capture any character until the earliest triple asterisk not followed by an asterisk
+        // The captured content (including its double asterisks) is parsed with the bold matcher only
+        private static readonly IMatcher<Node> ItalicBoldFormattedNodeMatcher = new RegexMatcher<Node>(
+            new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
+            m => new FormattedNode(m.Value, "*", TextFormatting.Italic, Parse(m.Groups[1].Value, BoldFormattedNodeMatcher)));
+
+        // Capture any character except underscore until an underscore
+        // Closing underscore must not be followed by a word character
+        private static readonly IMatcher<Node> ItalicAltFormattedNodeMatcher = new RegexMatcher<Node>(
+            new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline),
+            m => new FormattedNode(m.Value, "_", TextFormatting.Italic, Parse(m.Groups[1].Value)));
+
+        // Capture any character until the earliest double underscore not followed by an underscore
+        private static readonly IMatcher<Node> UnderlineFormattedNodeMatcher = new RegexMatcher<Node>(
+            new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
+            m => new FormattedNode(m.Value, "__", TextFormatting.Underline, Parse(m.Groups[1].Value)));
+
+        // Capture any character until the earliest triple underscore not followed by an underscore
+        // The captured content (including its double underscores) is parsed with the underline matcher only
+        private static readonly IMatcher<Node> ItalicUnderlineFormattedNodeMatcher = new RegexMatcher<Node>(
+            new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
+            m => new FormattedNode(m.Value, "_", TextFormatting.Italic, Parse(m.Groups[1].Value, UnderlineFormattedNodeMatcher)));
+
+        // Capture any character until the earliest double tilde
+        private static readonly IMatcher<Node> StrikethroughFormattedNodeMatcher = new RegexMatcher<Node>(
+            new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
+            m => new FormattedNode(m.Value, "~~", TextFormatting.Strikethrough, Parse(m.Groups[1].Value)));
+
+        // Capture any character until the earliest double pipe
+        private static readonly IMatcher<Node> SpoilerFormattedNodeMatcher = new RegexMatcher<Node>(
+            new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline),
+            m => new FormattedNode(m.Value, "||", TextFormatting.Spoiler, Parse(m.Groups[1].Value)));
+
+        /* Code blocks */
+
+        // Capture any character except backtick until a backtick
+        // Whitespace surrounding content inside backticks is trimmed
+        private static readonly IMatcher<Node> InlineCodeBlockNodeMatcher = new RegexMatcher<Node>(
+            new Regex("`([^`]+)`", DefaultRegexOptions | RegexOptions.Singleline),
+            m => new InlineCodeBlockNode(m.Value, m.Groups[1].Value.Trim()));
+
+        // Capture language identifier and then any character until the earliest triple backtick
+        // Language identifier is one word immediately after opening backticks, followed immediately by newline
+        // Whitespace surrounding content inside backticks is trimmed
+        private static readonly IMatcher<Node> MultilineCodeBlockNodeMatcher = new RegexMatcher<Node>(
+            new Regex("```(?:(\\w*)\\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline),
+            m => new MultilineCodeBlockNode(m.Value, m.Groups[1].Value, m.Groups[2].Value.Trim()));
+
+        /* Mentions */
+
+        // Capture @everyone
+        private static readonly IMatcher<Node> EveryoneMentionNodeMatcher = new StringMatcher<Node>(
+            "@everyone",
+            s => new MentionNode(s, "everyone", MentionType.Meta));
+
+        // Capture @here
+        private static readonly IMatcher<Node> HereMentionNodeMatcher = new StringMatcher<Node>(
+            "@here",
+            s => new MentionNode(s, "here", MentionType.Meta));
+
+        // Capture <@123456> or <@!123456>
+        private static readonly IMatcher<Node> UserMentionNodeMatcher = new RegexMatcher<Node>(
+            new Regex("<@!?(\\d+)>", DefaultRegexOptions),
+            m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.User));
+
+        // Capture <#123456>
+        private static readonly IMatcher<Node> ChannelMentionNodeMatcher = new RegexMatcher<Node>(
+            new Regex("<#(\\d+)>", DefaultRegexOptions),
+            m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Channel));
+
+        // Capture <@&123456>
+        private static readonly IMatcher<Node> RoleMentionNodeMatcher = new RegexMatcher<Node>(
+            new Regex("<@&(\\d+)>", DefaultRegexOptions),
+            m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Role));
+
+        /* Emojis */
+
+        // Capture any country flag emoji (two regional indicator surrogate pairs)
+        // ... or "symbol/other" character
+        // ... or surrogate pair
+        // ... or digit followed by enclosing mark
+        // (this does not match all emojis in Discord but it's reasonably accurate enough)
+        private static readonly IMatcher<Node> StandardEmojiNodeMatcher = new RegexMatcher<Node>(
+            new Regex("((?:[\\uD83C][\\uDDE6-\\uDDFF]){2}|\\p{So}|\\p{Cs}{2}|\\d\\p{Me})", DefaultRegexOptions),
+            m => new EmojiNode(m.Value, m.Groups[1].Value));
+
+        // Capture <:lul:123456> or <a:lul:123456>
+        private static readonly IMatcher<Node> CustomEmojiNodeMatcher = new RegexMatcher<Node>(
+            new Regex("<(a)?:(.+?):(\\d+?)>", DefaultRegexOptions),
+            m => new EmojiNode(m.Value, m.Groups[3].Value, m.Groups[2].Value, !m.Groups[1].Value.IsEmpty()));
+
+        /* Links */
+
+        // Capture [title](link)
+        private static readonly IMatcher<Node> TitledLinkNodeMatcher = new RegexMatcher<Node>(
+            new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions),
+            m => new LinkNode(m.Value, m.Groups[2].Value, m.Groups[1].Value));
+
+        // Capture any non-whitespace character after http:// or https:// until the last punctuation character or whitespace
+        private static readonly IMatcher<Node> AutoLinkNodeMatcher = new RegexMatcher<Node>(
+            new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions),
+            m => new LinkNode(m.Value, m.Groups[1].Value));
+
+        // Same as auto link but also surrounded by angular brackets
+        private static readonly IMatcher<Node> HiddenLinkNodeMatcher = new RegexMatcher<Node>(
+            new Regex("<(https?://\\S*[^\\.,:;\"\'\\s])>", DefaultRegexOptions),
+            m => new LinkNode(m.Value, m.Groups[1].Value));
+
+        /* Text */
+
+        // Capture the shrug emoticon
+        // This escapes it from matching for formatting
+        private static readonly IMatcher<Node> ShrugTextNodeMatcher = new StringMatcher<Node>(
+            @"¯\_(ツ)_/¯",
+            s => new TextNode(s));
+
+        // Capture any "symbol/other" character or surrogate pair preceded by a backslash
+        // This escapes it from matching for emoji
+        private static readonly IMatcher<Node> EscapedSymbolTextNodeMatcher = new RegexMatcher<Node>(
+            new Regex("\\\\(\\p{So}|\\p{Cs}{2})", DefaultRegexOptions),
+            m => new TextNode(m.Value, m.Groups[1].Value));
+
+        // Capture any non-whitespace, non latin alphanumeric character preceded by a backslash
+        // This escapes it from matching for formatting or other tokens
+        private static readonly IMatcher<Node> EscapedCharacterTextNodeMatcher = new RegexMatcher<Node>(
+            new Regex("\\\\([^a-zA-Z0-9\\s])", DefaultRegexOptions),
+            m => new TextNode(m.Value, m.Groups[1].Value));
+
+        // Combine all matchers into one
+        // Matchers that have similar patterns are ordered from most specific to least specific
+        private static readonly IMatcher<Node> AggregateNodeMatcher = new AggregateMatcher<Node>(
+            ItalicBoldFormattedNodeMatcher,
+            ItalicUnderlineFormattedNodeMatcher,
+            BoldFormattedNodeMatcher,
+            ItalicFormattedNodeMatcher,
+            UnderlineFormattedNodeMatcher,
+            ItalicAltFormattedNodeMatcher,
+            StrikethroughFormattedNodeMatcher,
+            SpoilerFormattedNodeMatcher,
+            MultilineCodeBlockNodeMatcher,
+            InlineCodeBlockNodeMatcher,
+            EveryoneMentionNodeMatcher,
+            HereMentionNodeMatcher,
+            UserMentionNodeMatcher,
+            ChannelMentionNodeMatcher,
+            RoleMentionNodeMatcher,
+            StandardEmojiNodeMatcher,
+            CustomEmojiNodeMatcher,
+            TitledLinkNodeMatcher,
+            AutoLinkNodeMatcher,
+            HiddenLinkNodeMatcher,
+            ShrugTextNodeMatcher,
+            EscapedSymbolTextNodeMatcher,
+            EscapedCharacterTextNodeMatcher);
+
+        // Parse the input with the given matcher
+        // Any text that the matcher does not recognize becomes a plain text node
+        private static IReadOnlyList<Node> Parse(string input, IMatcher<Node> matcher) =>
+            matcher.MatchAll(input, s => new TextNode(s)).Select(r => r.Value).ToArray();
+
+        // Entry point: parse the input with all of the matchers above combined
+        public static IReadOnlyList<Node> Parse(string input) => Parse(input, AggregateNodeMatcher);
    }
 }
--- a/DiscordChatExporter.Core.Markdown/Node.cs
+++ b/DiscordChatExporter.Core.Markdown/Node.cs
@ -1,12 +0,0 @@
-namespace DiscordChatExporter.Core.Markdown
-{
-    public abstract class Node
-    {
-        public string Lexeme { get; }
-
-        protected Node(string lexeme)
-        {
-            Lexeme = lexeme;
-        }
-    }
-}
--- a/DiscordChatExporter.Core.Markdown/Nodes/EmojiNode.cs
+++ b/DiscordChatExporter.Core.Markdown/Nodes/EmojiNode.cs
@ -1,6 +1,4 @@
-using Tyrrrz.Extensions;
-
-namespace DiscordChatExporter.Core.Markdown
+namespace DiscordChatExporter.Core.Markdown.Nodes
 {
    public class EmojiNode : Node
    {
@ -10,18 +8,18 @@ namespace DiscordChatExporter.Core.Markdown

        public bool IsAnimated { get; }

-        public bool IsCustomEmoji => Id.IsNotBlank();
+        public bool IsCustomEmoji => Id != null;

-        public EmojiNode(string lexeme, string id, string name, bool isAnimated)
-            : base(lexeme)
+        public EmojiNode(string source, string id, string name, bool isAnimated)
+            : base(source)
        {
            Id = id;
            Name = name;
            IsAnimated = isAnimated;
        }

-        public EmojiNode(string lexeme, string name)
-            : this(lexeme, null, name, false)
+        public EmojiNode(string source, string name)
+            : this(source, null, name, false)
        {
        }

--- a/DiscordChatExporter.Core.Markdown/Nodes/FormattedNode.cs
+++ b/DiscordChatExporter.Core.Markdown/Nodes/FormattedNode.cs
@ -1,6 +1,6 @@
 using System.Collections.Generic;

-namespace DiscordChatExporter.Core.Markdown
+namespace DiscordChatExporter.Core.Markdown.Nodes
 {
    public class FormattedNode : Node
    {
@ -10,8 +10,8 @@ namespace DiscordChatExporter.Core.Markdown

        public IReadOnlyList<Node> Children { get; }

-        public FormattedNode(string lexeme, string token, TextFormatting formatting, IReadOnlyList<Node> children)
-            : base(lexeme)
+        public FormattedNode(string source, string token, TextFormatting formatting, IReadOnlyList<Node> children)
+            : base(source)
        {
            Token = token;
            Formatting = formatting;
--- a/DiscordChatExporter.Core.Markdown/Nodes/InlineCodeBlockNode.cs
+++ b/DiscordChatExporter.Core.Markdown/Nodes/InlineCodeBlockNode.cs
@ -1,11 +1,11 @@
-namespace DiscordChatExporter.Core.Markdown
+namespace DiscordChatExporter.Core.Markdown.Nodes
 {
    public class InlineCodeBlockNode : Node
    {
        public string Code { get; }

-        public InlineCodeBlockNode(string lexeme, string code)
-            : base(lexeme)
+        public InlineCodeBlockNode(string source, string code)
+            : base(source)
        {
            Code = code;
        }
--- a/DiscordChatExporter.Core.Markdown/Nodes/LinkNode.cs
+++ b/DiscordChatExporter.Core.Markdown/Nodes/LinkNode.cs
@ -1,4 +1,4 @@
-namespace DiscordChatExporter.Core.Markdown
+namespace DiscordChatExporter.Core.Markdown.Nodes
 {
    public class LinkNode : Node
    {
@ -6,14 +6,14 @@

        public string Title { get; }

-        public LinkNode(string lexeme, string url, string title)
-            : base(lexeme)
+        public LinkNode(string source, string url, string title)
+            : base(source)
        {
            Url = url;
            Title = title;
        }

-        public LinkNode(string lexeme, string url) : this(lexeme, url, url)
+        public LinkNode(string source, string url) : this(source, url, url)
        {
        }

--- a/DiscordChatExporter.Core.Markdown/Nodes/MentionNode.cs
+++ b/DiscordChatExporter.Core.Markdown/Nodes/MentionNode.cs
@ -1,4 +1,4 @@
-namespace DiscordChatExporter.Core.Markdown
+namespace DiscordChatExporter.Core.Markdown.Nodes
 {
    public class MentionNode : Node
    {
@ -6,8 +6,8 @@

        public MentionType Type { get; }

-        public MentionNode(string lexeme, string id, MentionType type)
-            : base(lexeme)
+        public MentionNode(string source, string id, MentionType type)
+            : base(source)
        {
            Id = id;
            Type = type;
--- a/DiscordChatExporter.Core.Markdown/Nodes/MentionType.cs
+++ b/DiscordChatExporter.Core.Markdown/Nodes/MentionType.cs
@ -1,4 +1,4 @@
-namespace DiscordChatExporter.Core.Markdown
+namespace DiscordChatExporter.Core.Markdown.Nodes
 {
    public enum MentionType
    {
--- a/DiscordChatExporter.Core.Markdown/Nodes/MultilineCodeBlockNode.cs
+++ b/DiscordChatExporter.Core.Markdown/Nodes/MultilineCodeBlockNode.cs
@ -1,4 +1,4 @@
-namespace DiscordChatExporter.Core.Markdown
+namespace DiscordChatExporter.Core.Markdown.Nodes
 {
    public class MultilineCodeBlockNode : Node
    {
@ -6,8 +6,8 @@

        public string Code { get; }

-        public MultilineCodeBlockNode(string lexeme, string language, string code)
-            : base(lexeme)
+        public MultilineCodeBlockNode(string source, string language, string code)
+            : base(source)
        {
            Language = language;
            Code = code;
--- a/DiscordChatExporter.Core.Markdown/Nodes/Node.cs
+++ b/DiscordChatExporter.Core.Markdown/Nodes/Node.cs
@ -0,0 +1,12 @@
+namespace DiscordChatExporter.Core.Markdown.Nodes
+{
+    // Base class for all nodes produced by the markdown parser
+    public abstract class Node
+    {
+        // Source text of the node; the parser in this change passes the matched markdown text here
+        public string Source { get; }
+
+        protected Node(string source)
+        {
+            Source = source;
+        }
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Nodes/TextFormatting.cs
+++ b/DiscordChatExporter.Core.Markdown/Nodes/TextFormatting.cs
@ -1,4 +1,4 @@
-namespace DiscordChatExporter.Core.Markdown
+namespace DiscordChatExporter.Core.Markdown.Nodes
 {
    public enum TextFormatting
    {
--- a/DiscordChatExporter.Core.Markdown/Nodes/TextNode.cs
+++ b/DiscordChatExporter.Core.Markdown/Nodes/TextNode.cs
@ -1,11 +1,11 @@
-namespace DiscordChatExporter.Core.Markdown
+namespace DiscordChatExporter.Core.Markdown.Nodes
 {
    public class TextNode : Node
    {
        public string Text { get; }

-        public TextNode(string lexeme, string text)
-            : base(lexeme)
+        public TextNode(string source, string text)
+            : base(source)
        {
            Text = text;
        }