Improve performance (#162)

This commit is contained in:
Alexey Golub 2019-04-10 23:45:21 +03:00 committed by GitHub
parent 359278afec
commit 4bfb2ec7fd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
86 changed files with 1242 additions and 900 deletions

View file

@ -5,8 +5,7 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Sprache" Version="2.2.0" />
<PackageReference Include="Tyrrrz.Extensions" Version="1.5.1" />
<PackageReference Include="Tyrrrz.Extensions" Version="1.6.0" />
</ItemGroup>
</Project>
</Project>

View file

@ -0,0 +1,46 @@
using System.Collections.Generic;
namespace DiscordChatExporter.Core.Markdown.Internal
{
/// <summary>
/// Matcher that delegates to a list of inner matchers and reports whichever match starts earliest.
/// When several matches start at the same index, the matcher listed first wins.
/// </summary>
internal class AggregateMatcher<T> : IMatcher<T>
{
    private readonly IReadOnlyList<IMatcher<T>> _matchers;

    /// <summary>
    /// Creates an aggregate over the given matchers; their order decides ties.
    /// </summary>
    public AggregateMatcher(IReadOnlyList<IMatcher<T>> matchers)
    {
        _matchers = matchers;
    }

    /// <summary>
    /// Convenience overload accepting the matchers as a parameter array.
    /// </summary>
    public AggregateMatcher(params IMatcher<T>[] matchers)
        : this((IReadOnlyList<IMatcher<T>>) matchers)
    {
    }

    /// <summary>
    /// Runs the inner matchers over the same window and returns the match with the lowest start index,
    /// or null when none of them matches.
    /// </summary>
    public ParsedMatch<T> Match(string input, int startIndex, int length)
    {
        ParsedMatch<T> best = null;

        for (var i = 0; i < _matchers.Count; i++)
        {
            var candidate = _matchers[i].Match(input, startIndex, length);

            // Strict comparison: an equally-early candidate never displaces the current best
            if (candidate != null && (best == null || candidate.StartIndex < best.StartIndex))
                best = candidate;

            // Nothing can start before the beginning of the window, so the search can stop here
            if (best != null && best.StartIndex == startIndex)
                break;
        }

        return best;
    }
}
}

View file

@ -0,0 +1,50 @@
using System;
using System.Collections.Generic;
namespace DiscordChatExporter.Core.Markdown.Internal
{
/// <summary>
/// Extension methods for <see cref="IMatcher{T}"/>.
/// </summary>
internal static class Extensions
{
    /// <summary>
    /// Splits the window <c>[startIndex, startIndex + length)</c> of <paramref name="input"/> into a
    /// sequence of matches produced by <paramref name="matcher"/>, filling every gap between matches
    /// (and any unmatched tail) with a fallback entry.
    /// </summary>
    /// <param name="matcher">Matcher applied repeatedly to the not-yet-consumed part of the window.</param>
    /// <param name="input">Full input string.</param>
    /// <param name="startIndex">Index in <paramref name="input"/> where the window begins.</param>
    /// <param name="length">Number of characters in the window.</param>
    /// <param name="fallbackTransform">Converts unmatched text into a value.</param>
    /// <returns>Lazily evaluated sequence of matches and fallbacks, in input order.</returns>
    public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, string input,
        int startIndex, int length, Func<string, T> fallbackTransform)
    {
        // Get end index for simplicity
        var endIndex = startIndex + length;

        // Loop through segments divided by individual matches
        var currentIndex = startIndex;
        while (currentIndex < endIndex)
        {
            // Find a match within the remaining part of the window
            var match = matcher.Match(input, currentIndex, endIndex - currentIndex);

            // If there's no match - stop, the remainder is handled below
            if (match == null)
                break;

            // If this match doesn't start immediately at current index - transform and yield fallback first
            if (match.StartIndex > currentIndex)
            {
                var fallback = input.Substring(currentIndex, match.StartIndex - currentIndex);
                yield return new ParsedMatch<T>(currentIndex, fallback.Length, fallbackTransform(fallback));
            }

            // Yield match
            yield return match;

            // Shift current index to the end of the match
            currentIndex = match.StartIndex + match.Length;
        }

        // If the end of the window wasn't reached - transform and yield the remaining part as fallback
        if (currentIndex < endIndex)
        {
            // Bound the fallback by endIndex: taking everything up to the end of the string would
            // leak characters that lie outside the requested window when it is shorter than the input
            var fallback = input.Substring(currentIndex, endIndex - currentIndex);
            yield return new ParsedMatch<T>(currentIndex, fallback.Length, fallbackTransform(fallback));
        }
    }

    /// <summary>
    /// Same as the windowed overload, applied to the whole of <paramref name="input"/>.
    /// </summary>
    public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, string input,
        Func<string, T> fallbackTransform) => matcher.MatchAll(input, 0, input.Length, fallbackTransform);
}
}

View file

@ -1,178 +0,0 @@
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using Sprache;
using Tyrrrz.Extensions;
namespace DiscordChatExporter.Core.Markdown.Internal
{
// The following parsing logic is meant to replicate Discord's markdown grammar as closely as possible
/// <summary>
/// Sprache-based grammar that turns a markdown string into a list of <c>Node</c> objects.
/// Each parser below recognizes one kind of token; they are combined with <c>Or</c>, so the order
/// in which alternatives are listed decides which one is tried first.
/// </summary>
internal static class Grammar
{
/* Formatting */
// Note: every formatting parser re-parses its captured inner text by calling BuildTree recursively
// Capture until the earliest double asterisk not followed by an asterisk
private static readonly Parser<Node> BoldFormattedNode =
Parse.RegexMatch(new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", RegexOptions.Singleline))
.Select(m => new FormattedNode(m.Value, "**", TextFormatting.Bold, BuildTree(m.Groups[1].Value)));
// Capture until the earliest single asterisk not preceded or followed by an asterisk
// Can't have whitespace right after opening or right before closing asterisk
private static readonly Parser<Node> ItalicFormattedNode =
Parse.RegexMatch(new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", RegexOptions.Singleline))
.Select(m => new FormattedNode(m.Value, "*", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
// Can't have underscores inside
// Can't have word characters right after closing underscore
private static readonly Parser<Node> ItalicAltFormattedNode =
Parse.RegexMatch(new Regex("_([^_]+?)_(?!\\w)", RegexOptions.Singleline))
.Select(m => new FormattedNode(m.Value, "_", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
// Treated as a separate entity for simplicity
// Capture until the earliest triple asterisk not preceded or followed by an asterisk
// The captured group keeps the inner double asterisks, so the children are parsed as bold
private static readonly Parser<Node> ItalicBoldFormattedNode =
Parse.RegexMatch(new Regex("\\*(\\*\\*(?:.+?)\\*\\*)\\*(?!\\*)", RegexOptions.Singleline))
.Select(m => new FormattedNode(m.Value, "*", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
// Capture until the earliest double underscore not followed by an underscore
private static readonly Parser<Node> UnderlineFormattedNode =
Parse.RegexMatch(new Regex("__(.+?)__(?!_)", RegexOptions.Singleline))
.Select(m => new FormattedNode(m.Value, "__", TextFormatting.Underline, BuildTree(m.Groups[1].Value)));
// Treated as a separate entity for simplicity
// Capture until the earliest triple underscore not preceded or followed by an underscore
// The captured group keeps the inner double underscores, so the children are parsed as underline
private static readonly Parser<Node> ItalicUnderlineFormattedNode =
Parse.RegexMatch(new Regex("_(__(?:.+?)__)_(?!_)", RegexOptions.Singleline))
.Select(m => new FormattedNode(m.Value, "_", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
// Strikethrough is safe
private static readonly Parser<Node> StrikethroughFormattedNode =
Parse.RegexMatch(new Regex("~~(.+?)~~", RegexOptions.Singleline))
.Select(m => new FormattedNode(m.Value, "~~", TextFormatting.Strikethrough, BuildTree(m.Groups[1].Value)));
// Spoiler is safe
private static readonly Parser<Node> SpoilerFormattedNode =
Parse.RegexMatch(new Regex("\\|\\|(.+?)\\|\\|", RegexOptions.Singleline))
.Select(m => new FormattedNode(m.Value, "||", TextFormatting.Spoiler, BuildTree(m.Groups[1].Value)));
// Combinator, order matters
// The combined (triple-token) forms are listed before the plain bold/italic/underline forms
private static readonly Parser<Node> AnyFormattedNode =
ItalicBoldFormattedNode.Or(ItalicUnderlineFormattedNode)
.Or(BoldFormattedNode).Or(ItalicFormattedNode)
.Or(UnderlineFormattedNode).Or(ItalicAltFormattedNode)
.Or(StrikethroughFormattedNode).Or(SpoilerFormattedNode);
/* Code blocks */
// Can't have backticks inside and surrounding whitespace is trimmed
private static readonly Parser<Node> InlineCodeBlockNode =
Parse.RegexMatch(new Regex("`\\s*([^`]+?)\\s*`", RegexOptions.Singleline))
.Select(m => new InlineCodeBlockNode(m.Value, m.Groups[1].Value));
// The first word is a language identifier if it's the only word followed by a newline, the rest is code
private static readonly Parser<Node> MultilineCodeBlockNode =
Parse.RegexMatch(new Regex("```(?:(\\w*?)?(?:\\s*?\\n))?(.+?)```", RegexOptions.Singleline))
.Select(m => new MultilineCodeBlockNode(m.Value, m.Groups[1].Value, m.Groups[2].Value));
// Combinator, order matters
// Triple-backtick blocks are tried before single-backtick inline code
private static readonly Parser<Node> AnyCodeBlockNode = MultilineCodeBlockNode.Or(InlineCodeBlockNode);
/* Mentions */
// @everyone or @here
private static readonly Parser<Node> MetaMentionNode = Parse.RegexMatch("@(everyone|here)")
.Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Meta));
// <@123456> or <@!123456>
private static readonly Parser<Node> UserMentionNode = Parse.RegexMatch("<@!?(\\d+)>")
.Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.User));
// <#123456>
private static readonly Parser<Node> ChannelMentionNode = Parse.RegexMatch("<#(\\d+)>")
.Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Channel));
// <@&123456>
private static readonly Parser<Node> RoleMentionNode = Parse.RegexMatch("<@&(\\d+)>")
.Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Role));
// Combinator, order matters
private static readonly Parser<Node> AnyMentionNode =
MetaMentionNode.Or(UserMentionNode).Or(ChannelMentionNode).Or(RoleMentionNode);
/* Emojis */
// Matches all standard unicode emojis
// NOTE(review): "all" is the original author's claim; the pattern is a fixed list of code point ranges - confirm coverage
private static readonly Parser<Node> StandardEmojiNode = Parse.RegexMatch(
"([\\u2700-\\u27bf]|" +
"(?:\\ud83c[\\udde6-\\uddff]){2}|" +
"[\\ud800-\\udbff][\\udc00-\\udfff]|" +
"[\\u0023-\\u0039]\\u20e3|" +
"\\u3299|\\u3297|\\u303d|\\u3030|\\u24c2|\\ud83c[\\udd70-\\udd71]|\\ud83c[\\udd7e-\\udd7f]|\\ud83c\\udd8e|\\ud83c[\\udd91-\\udd9a]|\\ud83c[\\udde6-\\uddff]|" +
"\\ud83c[\\ude01-\\ude02]|\\ud83c\\ude1a|\\ud83c\\ude2f|\\ud83c[\\ude32-\\ude3a]|\\ud83c[\\ude50-\\ude51]|\\u203c|\\u2049|[\\u25aa-\\u25ab]|" +
"\\u25b6|\\u25c0|[\\u25fb-\\u25fe]|\\u00a9|\\u00ae|\\u2122|\\u2139|\\ud83c\\udc04|[\\u2600-\\u26FF]|\\u2b05|\\u2b06|\\u2b07|\\u2b1b|\\u2b1c|\\u2b50|" +
"\\u2b55|\\u231a|\\u231b|\\u2328|\\u23cf|[\\u23e9-\\u23f3]|[\\u23f8-\\u23fa]|\\ud83c\\udccf|\\u2934|\\u2935|[\\u2190-\\u21ff])")
.Select(m => new EmojiNode(m.Value, m.Groups[1].Value));
// <:lul:123456> or <a:lul:123456>
// Group 1 is the optional "a" marker, group 2 is the name, group 3 is the numeric id
private static readonly Parser<Node> CustomEmojiNode = Parse.RegexMatch("<(a)?:(.+?):(\\d+)>")
.Select(m => new EmojiNode(m.Value, m.Groups[3].Value, m.Groups[2].Value, m.Groups[1].Value.IsNotBlank()));
// Combinator, order matters
private static readonly Parser<Node> AnyEmojiNode = StandardEmojiNode.Or(CustomEmojiNode);
/* Links */
// [title](link)
private static readonly Parser<Node> TitledLinkNode = Parse.RegexMatch("\\[(.+?)\\]\\((.+?)\\)")
.Select(m => new LinkNode(m.Value, m.Groups[2].Value, m.Groups[1].Value));
// Starts with http:// or https://, stops at the last non-whitespace character followed by whitespace or punctuation character
private static readonly Parser<Node> AutoLinkNode = Parse.RegexMatch("(https?://\\S*[^\\.,:;\"\'\\s])")
.Select(m => new LinkNode(m.Value, m.Groups[1].Value));
// Autolink surrounded by angular brackets
private static readonly Parser<Node> HiddenLinkNode = Parse.RegexMatch("<(https?://\\S*[^\\.,:;\"\'\\s])>")
.Select(m => new LinkNode(m.Value, m.Groups[1].Value));
// Combinator, order matters
private static readonly Parser<Node> AnyLinkNode = TitledLinkNode.Or(HiddenLinkNode).Or(AutoLinkNode);
/* Text */
// Shrug is an exception and needs to be exempt from formatting
private static readonly Parser<Node> ShrugTextNode =
Parse.String("¯\\_(ツ)_/¯").Text().Select(s => new TextNode(s));
// Backslash escapes any following unicode surrogate pair
// The node's lexeme keeps the backslash while its text drops it
private static readonly Parser<Node> EscapedSurrogateTextNode =
from slash in Parse.Char('\\')
from high in Parse.AnyChar.Where(char.IsHighSurrogate)
from low in Parse.AnyChar
let lexeme = $"{slash}{high}{low}"
let text = $"{high}{low}"
select new TextNode(lexeme, text);
// Backslash escapes any following non-whitespace character except for digits and latin letters
private static readonly Parser<Node> EscapedTextNode =
Parse.RegexMatch("\\\\([^a-zA-Z0-9\\s])").Select(m => new TextNode(m.Value, m.Groups[1].Value));
// Combinator, order matters
private static readonly Parser<Node> AnyTextNode = ShrugTextNode.Or(EscapedSurrogateTextNode).Or(EscapedTextNode);
/* Aggregator and fallback */
// Any node recognized by above patterns
private static readonly Parser<Node> AnyRecognizedNode = AnyFormattedNode.Or(AnyCodeBlockNode)
.Or(AnyMentionNode).Or(AnyEmojiNode).Or(AnyLinkNode).Or(AnyTextNode);
// Any node not recognized by above patterns (treated as plain text)
// Consumes one or more characters, each taken only where AnyRecognizedNode does not apply
private static readonly Parser<Node> FallbackNode =
Parse.AnyChar.Except(AnyRecognizedNode).AtLeastOnce().Text().Select(s => new TextNode(s));
// Any node
private static readonly Parser<Node> AnyNode = AnyRecognizedNode.Or(FallbackNode);
// Entry point
// Parses the input as a sequence of AnyNode and materializes the result into an array
public static IReadOnlyList<Node> BuildTree(string input) => AnyNode.Many().Parse(input).ToArray();
}
}

View file

@ -0,0 +1,7 @@
namespace DiscordChatExporter.Core.Markdown.Internal
{
/// <summary>
/// Finds a single match of type <typeparamref name="T"/> inside a window of an input string.
/// </summary>
internal interface IMatcher<T>
{
/// <summary>
/// Looks for a match in <paramref name="input"/>, limited to <paramref name="length"/> characters
/// starting at <paramref name="startIndex"/>.
/// </summary>
/// <returns>
/// The match that was found. NOTE(review): the matchers in this namespace return null when nothing
/// matches - presumably that is the contract for all implementations; confirm.
/// </returns>
ParsedMatch<T> Match(string input, int startIndex, int length);
}
}

View file

@ -0,0 +1,18 @@
namespace DiscordChatExporter.Core.Markdown.Internal
{
/// <summary>
/// Immutable result of a match: the position and length it covers plus the value produced for it.
/// </summary>
internal partial class ParsedMatch<T>
{
/// <summary>Index at which the match starts.</summary>
public int StartIndex { get; }
/// <summary>Number of characters covered by the match.</summary>
public int Length { get; }
/// <summary>Value associated with the match.</summary>
public T Value { get; }
/// <summary>
/// Stores the given start index, length and value without any validation.
/// </summary>
public ParsedMatch(int startIndex, int length, T value)
{
StartIndex = startIndex;
Length = length;
Value = value;
}
}
}

View file

@ -0,0 +1,23 @@
using System;
using System.Text.RegularExpressions;
namespace DiscordChatExporter.Core.Markdown.Internal
{
/// <summary>
/// Matcher backed by a regular expression; a successful regex match is converted to a value
/// by the supplied transform.
/// </summary>
internal class RegexMatcher<T> : IMatcher<T>
{
    private readonly Regex _regex;
    private readonly Func<Match, T> _transform;

    /// <summary>
    /// Creates a matcher from a regular expression and a function that maps its match to a value.
    /// </summary>
    public RegexMatcher(Regex regex, Func<Match, T> transform)
    {
        _regex = regex;
        _transform = transform;
    }

    /// <summary>
    /// Runs the regular expression over the given window of <paramref name="input"/>.
    /// </summary>
    /// <returns>The transformed match with the regex match's index and length, or null when the regex fails.</returns>
    public ParsedMatch<T> Match(string input, int startIndex, int length)
    {
        var result = _regex.Match(input, startIndex, length);

        // Nothing found within the window
        if (!result.Success)
            return null;

        var value = _transform(result);
        return new ParsedMatch<T>(result.Index, result.Length, value);
    }
}
}

View file

@ -0,0 +1,29 @@
using System;
namespace DiscordChatExporter.Core.Markdown.Internal
{
/// <summary>
/// Matcher that searches for a fixed string and converts it to a value with the supplied transform.
/// </summary>
internal class StringMatcher<T> : IMatcher<T>
{
    private readonly string _needle;
    private readonly StringComparison _comparison;
    private readonly Func<string, T> _transform;

    /// <summary>
    /// Creates a matcher for <paramref name="needle"/> using the given comparison rules.
    /// </summary>
    public StringMatcher(string needle, StringComparison comparison, Func<string, T> transform)
    {
        _needle = needle;
        _comparison = comparison;
        _transform = transform;
    }

    /// <summary>
    /// Creates a matcher for <paramref name="needle"/> using ordinal comparison.
    /// </summary>
    public StringMatcher(string needle, Func<string, T> transform)
        : this(needle, StringComparison.Ordinal, transform)
    {
    }

    /// <summary>
    /// Searches the given window of <paramref name="input"/> for the needle.
    /// </summary>
    /// <returns>
    /// A match positioned at the found index with the needle's length, whose value is the transform
    /// applied to the needle itself; null when the needle is not found.
    /// </returns>
    public ParsedMatch<T> Match(string input, int startIndex, int length)
    {
        var position = input.IndexOf(_needle, startIndex, length, _comparison);

        // IndexOf reports a negative value when the needle is absent
        if (position < 0)
            return null;

        return new ParsedMatch<T>(position, _needle.Length, _transform(_needle));
    }
}
}

View file

@ -1,10 +1,187 @@
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using DiscordChatExporter.Core.Markdown.Internal;
using DiscordChatExporter.Core.Markdown.Nodes;
using Tyrrrz.Extensions;
namespace DiscordChatExporter.Core.Markdown
{
// The following parsing logic is meant to replicate Discord's markdown grammar as closely as possible
/// <summary>
/// Parses markdown text into a list of <see cref="Node"/> objects using a set of regex and
/// string matchers combined into a single aggregate matcher.
/// </summary>
public static class MarkdownParser
{
    private const RegexOptions DefaultRegexOptions = RegexOptions.Compiled | RegexOptions.CultureInvariant;

    /* Formatting */

    // Capture any character until the earliest double asterisk not followed by an asterisk
    private static readonly IMatcher<Node> BoldFormattedNodeMatcher = new RegexMatcher<Node>(
        new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
        m => new FormattedNode(m.Value, "**", TextFormatting.Bold, Parse(m.Groups[1].Value)));

    // Capture any character until the earliest single asterisk not preceded or followed by an asterisk
    // Opening asterisk must not be followed by whitespace
    // Closing asterisk must not be preceded by whitespace
    private static readonly IMatcher<Node> ItalicFormattedNodeMatcher = new RegexMatcher<Node>(
        new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
        m => new FormattedNode(m.Value, "*", TextFormatting.Italic, Parse(m.Groups[1].Value)));

    // Capture any character until the earliest triple asterisk not followed by an asterisk
    // The inner text (which keeps its double asterisks) is parsed with the bold matcher only
    private static readonly IMatcher<Node> ItalicBoldFormattedNodeMatcher = new RegexMatcher<Node>(
        new Regex("\\*(\\*\\*.+?\\*\\*)\\*(?!\\*)", DefaultRegexOptions | RegexOptions.Singleline),
        m => new FormattedNode(m.Value, "*", TextFormatting.Italic, Parse(m.Groups[1].Value, BoldFormattedNodeMatcher)));

    // Capture any character except underscore until an underscore
    // Closing underscore must not be followed by a word character
    private static readonly IMatcher<Node> ItalicAltFormattedNodeMatcher = new RegexMatcher<Node>(
        new Regex("_([^_]+)_(?!\\w)", DefaultRegexOptions | RegexOptions.Singleline),
        m => new FormattedNode(m.Value, "_", TextFormatting.Italic, Parse(m.Groups[1].Value)));

    // Capture any character until the earliest double underscore not followed by an underscore
    private static readonly IMatcher<Node> UnderlineFormattedNodeMatcher = new RegexMatcher<Node>(
        new Regex("__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
        m => new FormattedNode(m.Value, "__", TextFormatting.Underline, Parse(m.Groups[1].Value)));

    // Capture any character until the earliest triple underscore not followed by an underscore
    // The inner text (which keeps its double underscores) is parsed with the underline matcher only
    private static readonly IMatcher<Node> ItalicUnderlineFormattedNodeMatcher = new RegexMatcher<Node>(
        new Regex("_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
        m => new FormattedNode(m.Value, "_", TextFormatting.Italic, Parse(m.Groups[1].Value, UnderlineFormattedNodeMatcher)));

    // Capture any character until the earliest double tilde
    private static readonly IMatcher<Node> StrikethroughFormattedNodeMatcher = new RegexMatcher<Node>(
        new Regex("~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
        m => new FormattedNode(m.Value, "~~", TextFormatting.Strikethrough, Parse(m.Groups[1].Value)));

    // Capture any character until the earliest double pipe
    private static readonly IMatcher<Node> SpoilerFormattedNodeMatcher = new RegexMatcher<Node>(
        new Regex("\\|\\|(.+?)\\|\\|", DefaultRegexOptions | RegexOptions.Singleline),
        m => new FormattedNode(m.Value, "||", TextFormatting.Spoiler, Parse(m.Groups[1].Value)));

    /* Code blocks */

    // Capture any character except backtick until a backtick
    // Whitespace surrounding content inside backticks is trimmed
    private static readonly IMatcher<Node> InlineCodeBlockNodeMatcher = new RegexMatcher<Node>(
        new Regex("`([^`]+)`", DefaultRegexOptions | RegexOptions.Singleline),
        m => new InlineCodeBlockNode(m.Value, m.Groups[1].Value.Trim()));

    // Capture language identifier and then any character until the earliest triple backtick
    // Language identifier is one word immediately after opening backticks, followed immediately by newline
    // Whitespace surrounding content inside backticks is trimmed
    private static readonly IMatcher<Node> MultilineCodeBlockNodeMatcher = new RegexMatcher<Node>(
        new Regex("```(?:(\\w*)\\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline),
        m => new MultilineCodeBlockNode(m.Value, m.Groups[1].Value, m.Groups[2].Value.Trim()));

    /* Mentions */

    // Capture @everyone
    private static readonly IMatcher<Node> EveryoneMentionNodeMatcher = new StringMatcher<Node>(
        "@everyone",
        s => new MentionNode(s, "everyone", MentionType.Meta));

    // Capture @here
    private static readonly IMatcher<Node> HereMentionNodeMatcher = new StringMatcher<Node>(
        "@here",
        s => new MentionNode(s, "here", MentionType.Meta));

    // Capture <@123456> or <@!123456>
    private static readonly IMatcher<Node> UserMentionNodeMatcher = new RegexMatcher<Node>(
        new Regex("<@!?(\\d+)>", DefaultRegexOptions),
        m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.User));

    // Capture <#123456>
    private static readonly IMatcher<Node> ChannelMentionNodeMatcher = new RegexMatcher<Node>(
        new Regex("<#(\\d+)>", DefaultRegexOptions),
        m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Channel));

    // Capture <@&123456>
    private static readonly IMatcher<Node> RoleMentionNodeMatcher = new RegexMatcher<Node>(
        new Regex("<@&(\\d+)>", DefaultRegexOptions),
        m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Role));

    /* Emojis */

    // Capture any country flag emoji (two regional indicator surrogate pairs)
    // ... or "symbol/other" character
    // ... or surrogate pair
    // ... or digit followed by enclosing mark
    // (this does not match all emojis in Discord but it's reasonably accurate enough)
    private static readonly IMatcher<Node> StandardEmojiNodeMatcher = new RegexMatcher<Node>(
        new Regex("((?:[\\uD83C][\\uDDE6-\\uDDFF]){2}|\\p{So}|\\p{Cs}{2}|\\d\\p{Me})", DefaultRegexOptions),
        m => new EmojiNode(m.Value, m.Groups[1].Value));

    // Capture <:lul:123456> or <a:lul:123456>
    // Group 1 is the optional "a" marker, group 2 is the name, group 3 is the numeric id
    private static readonly IMatcher<Node> CustomEmojiNodeMatcher = new RegexMatcher<Node>(
        new Regex("<(a)?:(.+?):(\\d+?)>", DefaultRegexOptions),
        m => new EmojiNode(m.Value, m.Groups[3].Value, m.Groups[2].Value, !m.Groups[1].Value.IsEmpty()));

    /* Links */

    // Capture [title](link)
    private static readonly IMatcher<Node> TitledLinkNodeMatcher = new RegexMatcher<Node>(
        new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions),
        m => new LinkNode(m.Value, m.Groups[2].Value, m.Groups[1].Value));

    // Capture any non-whitespace character after http:// or https:// until the last punctuation character or whitespace
    private static readonly IMatcher<Node> AutoLinkNodeMatcher = new RegexMatcher<Node>(
        new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions),
        m => new LinkNode(m.Value, m.Groups[1].Value));

    // Same as auto link but also surrounded by angular brackets
    private static readonly IMatcher<Node> HiddenLinkNodeMatcher = new RegexMatcher<Node>(
        new Regex("<(https?://\\S*[^\\.,:;\"\'\\s])>", DefaultRegexOptions),
        m => new LinkNode(m.Value, m.Groups[1].Value));

    /* Text */

    // Capture the shrug emoticon
    // This escapes it from matching for formatting
    private static readonly IMatcher<Node> ShrugTextNodeMatcher = new StringMatcher<Node>(
        @"¯\_(ツ)_/¯",
        s => new TextNode(s));

    // Capture any "symbol/other" character or surrogate pair preceded by a backslash
    // This escapes it from matching for emoji
    private static readonly IMatcher<Node> EscapedSymbolTextNodeMatcher = new RegexMatcher<Node>(
        new Regex("\\\\(\\p{So}|\\p{Cs}{2})", DefaultRegexOptions),
        m => new TextNode(m.Value, m.Groups[1].Value));

    // Capture any non-whitespace, non latin alphanumeric character preceded by a backslash
    // This escapes it from matching for formatting or other tokens
    private static readonly IMatcher<Node> EscapedCharacterTextNodeMatcher = new RegexMatcher<Node>(
        new Regex("\\\\([^a-zA-Z0-9\\s])", DefaultRegexOptions),
        m => new TextNode(m.Value, m.Groups[1].Value));

    // Combine all matchers into one
    // Matchers that have similar patterns are ordered from most specific to least specific
    // This field must stay below the matchers it aggregates: static field initializers run in
    // textual order, so moving it up would capture null references
    private static readonly IMatcher<Node> AggregateNodeMatcher = new AggregateMatcher<Node>(
        ItalicBoldFormattedNodeMatcher,
        ItalicUnderlineFormattedNodeMatcher,
        BoldFormattedNodeMatcher,
        ItalicFormattedNodeMatcher,
        UnderlineFormattedNodeMatcher,
        ItalicAltFormattedNodeMatcher,
        StrikethroughFormattedNodeMatcher,
        SpoilerFormattedNodeMatcher,
        MultilineCodeBlockNodeMatcher,
        InlineCodeBlockNodeMatcher,
        EveryoneMentionNodeMatcher,
        HereMentionNodeMatcher,
        UserMentionNodeMatcher,
        ChannelMentionNodeMatcher,
        RoleMentionNodeMatcher,
        StandardEmojiNodeMatcher,
        CustomEmojiNodeMatcher,
        TitledLinkNodeMatcher,
        AutoLinkNodeMatcher,
        HiddenLinkNodeMatcher,
        ShrugTextNodeMatcher,
        EscapedSymbolTextNodeMatcher,
        EscapedCharacterTextNodeMatcher);

    // Splits the input with the given matcher; text not covered by any match becomes a TextNode
    private static IReadOnlyList<Node> Parse(string input, IMatcher<Node> matcher) =>
        matcher.MatchAll(input, s => new TextNode(s)).Select(r => r.Value).ToArray();

    /// <summary>
    /// Parses <paramref name="input"/> with the aggregate of all matchers defined in this class.
    /// </summary>
    /// <param name="input">Markdown text to parse.</param>
    /// <returns>The parsed nodes in input order.</returns>
    public static IReadOnlyList<Node> Parse(string input) => Parse(input, AggregateNodeMatcher);
}
}

View file

@ -1,12 +0,0 @@
namespace DiscordChatExporter.Core.Markdown
{
/// <summary>
/// Base class for all markdown nodes.
/// </summary>
public abstract class Node
{
/// <summary>
/// Text passed to the constructor, stored as-is.
/// NOTE(review): presumably the raw markdown fragment this node was parsed from - confirm against callers.
/// </summary>
public string Lexeme { get; }
/// <summary>
/// Initializes the node with its lexeme.
/// </summary>
protected Node(string lexeme)
{
Lexeme = lexeme;
}
}
}

View file

@ -1,6 +1,4 @@
using Tyrrrz.Extensions;
namespace DiscordChatExporter.Core.Markdown
namespace DiscordChatExporter.Core.Markdown.Nodes
{
public class EmojiNode : Node
{
@ -10,18 +8,18 @@ namespace DiscordChatExporter.Core.Markdown
public bool IsAnimated { get; }
public bool IsCustomEmoji => Id.IsNotBlank();
public bool IsCustomEmoji => Id != null;
public EmojiNode(string lexeme, string id, string name, bool isAnimated)
: base(lexeme)
public EmojiNode(string source, string id, string name, bool isAnimated)
: base(source)
{
Id = id;
Name = name;
IsAnimated = isAnimated;
}
public EmojiNode(string lexeme, string name)
: this(lexeme, null, name, false)
public EmojiNode(string source, string name)
: this(source, null, name, false)
{
}

View file

@ -1,6 +1,6 @@
using System.Collections.Generic;
namespace DiscordChatExporter.Core.Markdown
namespace DiscordChatExporter.Core.Markdown.Nodes
{
public class FormattedNode : Node
{
@ -10,8 +10,8 @@ namespace DiscordChatExporter.Core.Markdown
public IReadOnlyList<Node> Children { get; }
public FormattedNode(string lexeme, string token, TextFormatting formatting, IReadOnlyList<Node> children)
: base(lexeme)
public FormattedNode(string source, string token, TextFormatting formatting, IReadOnlyList<Node> children)
: base(source)
{
Token = token;
Formatting = formatting;

View file

@ -1,11 +1,11 @@
namespace DiscordChatExporter.Core.Markdown
namespace DiscordChatExporter.Core.Markdown.Nodes
{
public class InlineCodeBlockNode : Node
{
public string Code { get; }
public InlineCodeBlockNode(string lexeme, string code)
: base(lexeme)
public InlineCodeBlockNode(string source, string code)
: base(source)
{
Code = code;
}

View file

@ -1,4 +1,4 @@
namespace DiscordChatExporter.Core.Markdown
namespace DiscordChatExporter.Core.Markdown.Nodes
{
public class LinkNode : Node
{
@ -6,14 +6,14 @@
public string Title { get; }
public LinkNode(string lexeme, string url, string title)
: base(lexeme)
public LinkNode(string source, string url, string title)
: base(source)
{
Url = url;
Title = title;
}
public LinkNode(string lexeme, string url) : this(lexeme, url, url)
public LinkNode(string source, string url) : this(source, url, url)
{
}

View file

@ -1,4 +1,4 @@
namespace DiscordChatExporter.Core.Markdown
namespace DiscordChatExporter.Core.Markdown.Nodes
{
public class MentionNode : Node
{
@ -6,8 +6,8 @@
public MentionType Type { get; }
public MentionNode(string lexeme, string id, MentionType type)
: base(lexeme)
public MentionNode(string source, string id, MentionType type)
: base(source)
{
Id = id;
Type = type;

View file

@ -1,4 +1,4 @@
namespace DiscordChatExporter.Core.Markdown
namespace DiscordChatExporter.Core.Markdown.Nodes
{
public enum MentionType
{

View file

@ -1,4 +1,4 @@
namespace DiscordChatExporter.Core.Markdown
namespace DiscordChatExporter.Core.Markdown.Nodes
{
public class MultilineCodeBlockNode : Node
{
@ -6,8 +6,8 @@
public string Code { get; }
public MultilineCodeBlockNode(string lexeme, string language, string code)
: base(lexeme)
public MultilineCodeBlockNode(string source, string language, string code)
: base(source)
{
Language = language;
Code = code;

View file

@ -0,0 +1,12 @@
namespace DiscordChatExporter.Core.Markdown.Nodes
{
/// <summary>
/// Base class for all markdown nodes.
/// </summary>
public abstract class Node
{
/// <summary>
/// Text passed to the constructor, stored as-is.
/// NOTE(review): presumably the raw markdown fragment this node was parsed from - confirm against callers.
/// </summary>
public string Source { get; }
/// <summary>
/// Initializes the node with its source text.
/// </summary>
protected Node(string source)
{
Source = source;
}
}
}

View file

@ -1,4 +1,4 @@
namespace DiscordChatExporter.Core.Markdown
namespace DiscordChatExporter.Core.Markdown.Nodes
{
public enum TextFormatting
{

View file

@ -1,11 +1,11 @@
namespace DiscordChatExporter.Core.Markdown
namespace DiscordChatExporter.Core.Markdown.Nodes
{
public class TextNode : Node
{
public string Text { get; }
public TextNode(string lexeme, string text)
: base(lexeme)
public TextNode(string source, string text)
: base(source)
{
Text = text;
}