import { has, flow, partial, map } from 'lodash'; import { joinPatternSegments, combinePatterns, replaceWhen } from '../regexHelper'; /** * Reusable regular expressions segments. */ const patternSegments = { /** * Matches zero or more HTML attributes followed by the tag close bracket, * which may be prepended by zero or more spaces. The attributes can use * single or double quotes and may be prepended by zero or more spaces. */ htmlOpeningTagEnd: /(?: *\w+=(?:(?:"[^"]*")|(?:'[^']*')))* *>/, }; /** * Patterns matching substrings that should not be escaped. Array values must be * joined before use. */ const nonEscapePatterns = { /** * HTML Tags * * Matches HTML opening tags and any attributes. Does not check for contents * between tags or closing tags. */ htmlTags: [ /** * Matches the beginning of an HTML tag, excluding preformatted tag types. */ /<(?!pre|style|script)[\w]+/, /** * Matches attributes. */ patternSegments.htmlOpeningTagEnd, ], /** * Preformatted HTML Blocks * * Matches HTML blocks with preformatted content. The content of these blocks, * including the tags and attributes, should not be escaped at all. */ preformattedHtmlBlocks: [ /** * Matches the names of tags known to have preformatted content. The capture * group is reused when matching the closing tag. * * NOTE: this pattern reuses a capture group, and could break if combined with * other expressions using capture groups. */ /<(pre|style|script)/, /** * Matches attributes. */ patternSegments.htmlOpeningTagEnd, /** * Allow zero or more of any character (including line breaks) between the * tags. Match lazily in case of subsequent blocks. */ /(.|[\n\r])*?/, /** * Match closing tag via first capture group. */ /<\/\1>/, ], }; /** * Escape patterns * * Each escape pattern matches a markdown entity and captures up to two * groups. These patterns must use one of the following formulas: * * - Single capture group followed by match content - /(...).../ * The captured characters should be escaped and the remaining match should * remain unchanged. * * - Two capture groups surrounding matched content - /(...)...(...)/ * The captured characters in both groups should be escaped and the matched * characters in between should remain unchanged. */ const escapePatterns = [ /** * Emphasis/Bold - Asterisk * * Match strings surrounded by one or more asterisks on both sides. */ /(\*+)[^*]*(\1)/g, /** * Emphasis - Underscore * * Match strings surrounded by a single underscore on both sides followed by * a word boundary. Remark disregards whether a word boundary exists at the * beginning of an emphasis node. */ /(_)[^_]+(_)\b/g, /** * Bold - Underscore * * Match strings surrounded by multiple underscores on both sides. Remark * disregards the absence of word boundaries on either side of a bold node. */ /(_{2,})[^_]*(\1)/g, /** * Strikethrough * * Match strings surrounded by multiple tildes on both sides. */ /(~+)[^~]*(\1)/g, /** * Inline Code * * Match strings surrounded by backticks. */ /(`+)[^`]*(\1)/g, /** * Links, Images, References, and Footnotes * * Match strings surrounded by brackets. This could be improved to * specifically match only the exact syntax of each covered entity, but * doing so through current approach would incur a considerable performance * penalty. */ /(\[)[^\]]*]/g, ]; /** * Generate new non-escape expression. The non-escape expression matches * substrings whose contents should not be processed for escaping. */ const joinedNonEscapePatterns = map(nonEscapePatterns, pattern => { return new RegExp(joinPatternSegments(pattern)); }); const nonEscapePattern = combinePatterns(joinedNonEscapePatterns); /** * Create chain of successive escape functions for various markdown entities. */ const escapeFunctions = escapePatterns.map(pattern => partial(escapeDelimiters, pattern)); const escapeAll = flow(escapeFunctions); /** * Executes both the `escapeCommonChars` and `escapeLeadingChars` functions. */ function escapeAllChars(text) { const partiallyEscapedMarkdown = escapeCommonChars(text); return escapeLeadingChars(partiallyEscapedMarkdown); } /** * escapeLeadingChars * * Handles escaping for characters that must be positioned at the beginning of * the string, such as headers and list items. * * Escapes '#', '*', '-', '>', '=', '|', and sequences of 3+ backticks or 4+ * spaces when found at the beginning of a string, preceded by zero or more * whitespace characters. */ function escapeLeadingChars(text) { return text.replace(/^\s*([-#*>=|]| {4,}|`{3,})/, '$`\\$1'); } /** * escapeCommonChars * * Escapes active markdown entities. See escape pattern groups for details on * which entities are replaced. */ function escapeCommonChars(text) { /** * Generate new non-escape expression (must happen at execution time because * we use `RegExp.exec`, which tracks it's own state internally). */ const nonEscapeExpression = new RegExp(nonEscapePattern, 'gm'); /** * Use `replaceWhen` to escape markdown entities only within substrings that * are eligible for escaping. */ return replaceWhen(nonEscapeExpression, escapeAll, text, true); } /** * escapeDelimiters * * Executes `String.replace` for a given pattern, but only on the first two * capture groups. Specifically intended for escaping opening (and optionally * closing) markdown entities without escaping the content in between. */ function escapeDelimiters(pattern, text) { return text.replace(pattern, (match, start, end) => { const hasEnd = typeof end === 'string'; const matchSliceEnd = hasEnd ? match.length - end.length : match.length; const content = match.slice(start.length, matchSliceEnd); return `${escape(start)}${content}${hasEnd ? escape(end) : ''}`; }); } /** * escape * * Simple replacement function for escaping markdown entities. Prepends every * character in the received string with a backslash. */ function escape(delim) { let result = ''; for (const char of delim) { result += `\\${char}`; } return result; } /** * A Remark plugin for escaping markdown entities. * * When markdown entities are entered in raw markdown, they don't appear as * characters in the resulting AST; for example, dashes surrounding a piece of * text cause the text to be inserted in a special node type, but the asterisks * themselves aren't present as text. Therefore, we generally don't expect to * encounter markdown characters in text nodes. * * However, the CMS visual editor does not interpret markdown characters, and * users will expect these characters to be represented literally. In that case, * we need to escape them, otherwise they'll be interpreted during * stringification. */ export default function remarkEscapeMarkdownEntities() { const transform = (node, index) => { /** * Shortcode nodes will intentionally inject markdown entities in text node * children not be escaped. */ if (has(node.data, 'shortcode')) return node; const children = node.children && node.children.map(transform); /** * Escape characters in text and html nodes only. We store a lot of normal * text in html nodes to keep Remark from escaping html entities. */ if (['text', 'html'].includes(node.type)) { /** * Escape all characters if this is the first child node, otherwise only * common characters. */ const value = index === 0 ? escapeAllChars(node.value) : escapeCommonChars(node.value); return { ...node, value, children }; } /** * Always return nodes with recursively mapped children. */ return { ...node, children }; }; return transform; }