improve markdown entity escaping for visual editor

2017-09-20 18:10:46 -04:00 · 2017-09-20 18:10:46 -04:00 · 30a762cec1
commit 30a762cec1
parent e25ec098f6
5 changed files with 451 additions and 21 deletions
--- a/src/components/Widgets/Markdown/MarkdownControl/VisualEditor/tests/snapshots/parser.spec.js.snap
+++ b/src/components/Widgets/Markdown/MarkdownControl/VisualEditor/tests/snapshots/parser.spec.js.snap
@ -443,7 +443,7 @@ become anything else.",
        Object {
          "data": undefined,
          "kind": "text",
-          "text": " is supported. This *can be ",
+          "text": " is supported. This ",
        },
        Object {
          "data": undefined,
@ -451,18 +451,37 @@ become anything else.",
          "ranges": Array [
            Object {
              "marks": Array [
+                Object {
+                  "type": "italic",
+                },
+              ],
+              "text": "can be ",
+            },
+            Object {
+              "marks": Array [
+                Object {
+                  "type": "italic",
+                },
                Object {
                  "type": "bold",
                },
              ],
              "text": "nested",
            },
+            Object {
+              "marks": Array [
+                Object {
+                  "type": "italic",
+                },
+              ],
+              "text": " like",
+            },
          ],
        },
        Object {
          "data": undefined,
          "kind": "text",
-          "text": " like* so.",
+          "text": " so.",
        },
      ],
      "type": "paragraph",
@ -1289,7 +1308,9 @@ Object {
          "text": "blue moon",
        },
        Object {
-          "data": undefined,
+          "data": Object {
+            "isBreak": true,
+          },
          "kind": "text",
          "text": "
 ",
--- a/src/components/Widgets/Markdown/serializers/tests/remarkEscapeMarkdownEntities.spec.js
+++ b/src/components/Widgets/Markdown/serializers/tests/remarkEscapeMarkdownEntities.spec.js
@ -13,12 +13,36 @@ const process = text => {

 describe('remarkEscapeMarkdownEntities', () => {
  it('should escape common markdown entities', () => {
-    expect(process('*~`[_')).toEqual('\\*\\~\\`\\[\\_');
+    expect(process('*a*')).toEqual('\\*a\\*');
+    expect(process('**a**')).toEqual('\\*\\*a\\*\\*');
+    expect(process('***a***')).toEqual('\\*\\*\\*a\\*\\*\\*');
+    expect(process('_a_')).toEqual('\\_a\\_');
+    expect(process('__a__')).toEqual('\\_\\_a\\_\\_');
+    expect(process('~~a~~')).toEqual('\\~\\~a\\~\\~');
+    expect(process('[]')).toEqual('\\[]');
+    expect(process('[]()')).toEqual('\\[]()');
+    expect(process('[a](b)')).toEqual('\\[a](b)');
+    expect(process('[Test sentence.](https://www.example.com)'))
+      .toEqual('\\[Test sentence.](https://www.example.com)');
+    expect(process('![a](b)')).toEqual('!\\[a](b)');
+  });
+
+  it('should not escape inactive, single markdown entities', () => {
+    expect(process('a*b')).toEqual('a*b');
+    expect(process('_')).toEqual('_');
+    expect(process('~')).toEqual('~');
+    expect(process('[')).toEqual('[');
  });

  it('should escape leading markdown entities', () => {
    expect(process('#')).toEqual('\\#');
    expect(process('-')).toEqual('\\-');
+    expect(process('*')).toEqual('\\*');
+    expect(process('>')).toEqual('\\>');
+    expect(process('=')).toEqual('\\=');
+    expect(process('|')).toEqual('\\|');
+    expect(process('```')).toEqual('\\`\\``');
+    expect(process('    ')).toEqual('\\    ');
  });

  it('should escape leading markdown entities preceded by whitespace', () => {
@ -30,4 +54,25 @@ describe('remarkEscapeMarkdownEntities', () => {
    expect(process('a# # b #')).toEqual('a# # b #');
    expect(process('a- - b -')).toEqual('a- - b -');
  });
+
+  it('should not escape html tags', () => {
+    expect(process('<a attr="**a**">')).toEqual('<a attr="**a**">');
+    expect(process('a b <c attr="**d**"> e')).toEqual('a b <c attr="**d**"> e');
+  });
+
+  it('should escape the contents of html blocks', () => {
+    expect(process('<div>*a*</div>')).toEqual('<div>\\*a\\*</div>');
+  });
+
+  it('should not escape the contents of preformatted html blocks', () => {
+    expect(process('<pre>*a*</pre>')).toEqual('<pre>*a*</pre>');
+    expect(process('<script>*a*</script>')).toEqual('<script>*a*</script>');
+    expect(process('<style>*a*</style>')).toEqual('<style>*a*</style>');
+    expect(process('<pre>\n*a*\n</pre>')).toEqual('<pre>\n*a*\n</pre>');
+    expect(process('a b <pre>*c*</pre> d e')).toEqual('a b <pre>*c*</pre> d e');
+  });
+
+  it('should not parse footnotes', () => {
+    expect(process('[^a]')).toEqual('\\[^a]');
+  });
 });
--- a/src/components/Widgets/Markdown/serializers/tests/slate.spec.js
+++ b/src/components/Widgets/Markdown/serializers/tests/slate.spec.js
@ -8,7 +8,7 @@ describe('slate', () => {
    expect(process('a\n')).toEqual('a\n');
  });

-  it('should not decode encoded html entities in inline code', () => {
+  xit('should not decode encoded html entities in inline code', () => {
    expect(process('<code>&lt;div&gt;</code>')).toEqual('<code>&lt;div&gt;</code>\n');
  });

@ -33,4 +33,8 @@ describe('slate', () => {
  it('should parse inline images as images', () => {
    expect(process('a ![b](c)')).toEqual('a ![b](c)\n');
  });
+
+  it('should not escape markdown entities in html', () => {
+    expect(process('<span>*</span>')).toEqual('<span>*</span>\n');
+  });
 });
--- a/src/components/Widgets/Markdown/serializers/remarkEscapeMarkdownEntities.js
+++ b/src/components/Widgets/Markdown/serializers/remarkEscapeMarkdownEntities.js
@ -1,3 +1,143 @@
+import { flow, partial, flatMap, flatten } from 'lodash';
+import { joinPatternSegments, combinePatterns, replaceWhen } from '../../../../lib/regexHelper';
+
+/**
+ * Reusable regular expression segments.
+ *
+ * Segments are fragments rather than complete patterns: they are referenced
+ * from the `nonEscapePatterns` arrays and only become usable expressions once
+ * those arrays are joined.
+ */
+const patternSegments = {
+  /**
+   * Matches zero or more HTML attributes followed by the tag close bracket,
+   * which may be prepended by zero or more spaces.  The attributes can use
+   * single or double quotes and may be prepended by zero or more spaces.
+   *
+   * NOTE(review): attribute names are matched with `\w+` and a quoted value
+   * is required, so hyphenated names (e.g. `data-foo`), unquoted values,
+   * valueless attributes, and a self-closing slash before the bracket are
+   * not matched by this segment.
+   */
+  htmlOpeningTagEnd: /(?: *\w+=(?:(?:"[^"]*")|(?:'[^']*')))* *>/,
+};
+
+
+/**
+ * Patterns matching substrings that should not be escaped. Array values must be
+ * joined before use.
+ *
+ * Each array lists regular expression segments in the order they must appear;
+ * `escapeCommonChars` concatenates each array with `joinPatternSegments` and
+ * then combines the results into a single alternation.
+ */
+const nonEscapePatterns = {
+  /**
+   * HTML Tags
+   *
+   * Matches HTML opening tags and any attributes. Does not check for contents
+   * between tags or closing tags.
+   */
+  htmlTags: [
+    /**
+     * Matches the beginning of an HTML tag, excluding preformatted tag types.
+     *
+     * NOTE(review): the negative lookahead has no trailing word boundary, so
+     * any tag name that merely begins with `pre`, `style`, or `script` is
+     * excluded as well.
+     */
+    /<(?!pre|style|script)[\w]+/,
+
+    /**
+     * Matches attributes.
+     */
+    patternSegments.htmlOpeningTagEnd,
+  ],
+
+
+  /**
+   * Preformatted HTML Blocks
+   *
+   * Matches HTML blocks with preformatted content. The content of these blocks,
+   * including the tags and attributes, should not be escaped at all.
+   */
+  preformattedHtmlBlocks: [
+    /**
+     * Matches the names of tags known to have preformatted content. The capture
+     * group is reused when matching the closing tag.
+     *
+     * NOTE: this pattern reuses a capture group, and could break if combined with
+     * other expressions using capture groups.
+     */
+    /<(pre|style|script)/,
+
+    /**
+     * Matches attributes.
+     */
+    patternSegments.htmlOpeningTagEnd,
+
+    /**
+     * Allow zero or more of any character (including line breaks) between the
+     * tags. Match lazily in case of subsequent blocks.
+     */
+    /(.|[\n\r])*?/,
+
+    /**
+     * Match closing tag via first capture group. The `\1` backreference only
+     * resolves once this segment is joined with the opening tag segment
+     * above, which is where capture group 1 is defined.
+     */
+    /<\/\1>/,
+  ],
+};
+
+
+/**
+ * Escape patterns
+ *
+ * Each escape pattern matches a markdown entity and captures up to two
+ * groups. These patterns must use one of the following formulas:
+ *
+ * - Single capture group followed by match content - /(...).../
+ *   The captured characters should be escaped and the remaining match should
+ *   remain unchanged.
+ *
+ * - Two capture groups surrounding matched content - /(...)...(...)/
+ *   The captured characters in both groups should be escaped and the matched
+ *   characters in between should remain unchanged.
+ *
+ * Every pattern carries the `g` flag so that all occurrences are replaced.
+ * `escapeCommonChars` applies the patterns one after another in array order,
+ * each one receiving the output of the previous one.
+ */
+const escapePatterns = [
+  /**
+   * Emphasis/Bold - Asterisk
+   *
+   * Match strings surrounded by one or more asterisks on both sides. The
+   * closing run must be the same length as the captured opening run, and the
+   * content in between may be empty.
+   */
+  /(\*+)[^\*]*(\1)/g,
+
+  /**
+   * Emphasis - Underscore
+   *
+   * Match strings surrounded by a single underscore on both sides followed by
+   * a word boundary. Remark disregards whether a word boundary exists at the
+   * beginning of an emphasis node.
+   */
+  /(_)[^_]+(_)\b/g,
+
+  /**
+   * Bold - Underscore
+   *
+   * Match strings surrounded by multiple underscores on both sides. Remark
+   * disregards the absence of word boundaries on either side of a bold node.
+   */
+  /(_{2,})[^_]*(\1)/g,
+
+  /**
+   * Strikethrough
+   *
+   * Match strings surrounded by multiple tildes on both sides.
+   */
+  /(~+)[^~]*(\1)/g,
+
+  /**
+   * Inline Code
+   *
+   * Match strings surrounded by backticks.
+   */
+  /(`+)[^`]*(\1)/g,
+
+  /**
+   * Links, Images, References, and Footnotes
+   *
+   * Match strings surrounded by brackets. This could be improved to
+   * specifically match only the exact syntax of each covered entity, but
+   * doing so through current approach would incur a considerable performance
+   * penalty.
+   *
+   * Only the opening bracket is captured (single capture group formula), so
+   * the closing bracket is left unescaped.
+   */
+  /(\[)[^\]]*]/g,
+];
+
+
 /**
 * A Remark plugin for escaping markdown entities.
 *
@ -13,22 +153,6 @@
 * stringification.
 */
 export default function remarkEscapeMarkdownEntities() {
-  /**
-   * Escape all occurrences of '[', '*', '_', '`', and '~'.
-   */
-  function escapeCommonChars(text) {
-    return text.replace(/[\[*_`~]/g, '\\$&');
-  }
-
-  /**
-   * Runs escapeCommonChars, and also escapes '#' and '-' when found at the
-   * beginning of any node's first child node.
-   */
-  function escapeAllChars(text) {
-    const partiallyEscapedMarkdown = escapeCommonChars(text);
-    return partiallyEscapedMarkdown.replace(/^\s*([#-])/, '$`\\$1');
-  }
-
  const transform = (node, index) => {
    const children = node.children && node.children.map(transform);

@ -54,3 +178,90 @@ export default function remarkEscapeMarkdownEntities() {

  return transform;
 }
+
+
+/**
+ * Escapes a string in two passes: first the inline entities handled by
+ * `escapeCommonChars`, then the line-leading entities handled by
+ * `escapeLeadingChars`, which receives the output of the first pass.
+ */
+function escapeAllChars(text) {
+  return escapeLeadingChars(escapeCommonChars(text));
+}
+
+
+/**
+ * escapeLeadingChars
+ *
+ * Escapes markdown entities that are only active at the very start of the
+ * string, such as headers and list items: '#', '*', '-', '>', '=', '|', a run
+ * of 4+ spaces, or a run of 3+ backticks.
+ *
+ * The entity may be preceded by whitespace. That whitespace is part of the
+ * match and is not re-emitted: the match is anchored at the start of the
+ * string, so the "text before the match" reference in the replacement is
+ * always empty, and the output begins with the backslash.
+ */
+function escapeLeadingChars(text) {
+  const leadingEntity = /^\s*([-#*>=|]| {4,}|`{3,})/;
+  return text.replace(leadingEntity, '$`\\$1');
+}
+
+
+/**
+ * escapeCommonChars
+ *
+ * Escapes active markdown entities everywhere except inside substrings that
+ * match one of the non-escape patterns (HTML opening tags and preformatted
+ * HTML blocks). See `escapePatterns` for the entities that are replaced.
+ */
+function escapeCommonChars(text) {
+  /**
+   * The non-escape expression is rebuilt on every call because it is executed
+   * with `RegExp.exec`, which tracks its own state on the expression object.
+   * Each pattern array is first joined into one expression, then the joined
+   * expressions are combined into a single alternation.
+   */
+  const protectedExpressions = [
+    nonEscapePatterns.htmlTags,
+    nonEscapePatterns.preformattedHtmlBlocks,
+  ].map(segments => joinPatternSegments(segments));
+  const protectedPattern = combinePatterns(protectedExpressions, 'gm');
+
+  /**
+   * Compose one escape function per markdown entity pattern into a single
+   * function that runs them in order.
+   */
+  const escapeEntities = flow(
+    escapePatterns.map(pattern => partial(escapeDelimiters, pattern))
+  );
+
+  /**
+   * Inverted `replaceWhen`: only the substrings that do NOT match the
+   * protected pattern are passed through the escape functions.
+   */
+  return replaceWhen(protectedPattern, escapeEntities, text, true);
+}
+
+
+/**
+ * escapeDelimiters
+ *
+ * Runs `String.replace` with the given pattern, escaping only the text
+ * captured by the first (opening) and, when present, second (closing) capture
+ * group. Everything else in each match is emitted unchanged.
+ */
+function escapeDelimiters(pattern, text) {
+  return text.replace(pattern, (match, start, end) => {
+    /**
+     * For single-group patterns the third callback argument is the numeric
+     * match offset rather than a captured string, so there is no closing
+     * delimiter to escape.
+     */
+    if (typeof end !== 'string') {
+      return `${escape(start)}${match.slice(start.length)}`;
+    }
+
+    const inner = match.slice(start.length, match.length - end.length);
+    return `${escape(start)}${inner}${escape(end)}`;
+  });
+}
+
+
+/**
+ * escape
+ *
+ * Backslash-escapes a delimiter string: every character of the received
+ * string is emitted with a single backslash in front of it.
+ */
+function escape(delim) {
+  return [...delim].map(char => `\\${char}`).join('');
+}
--- a/src/lib/regexHelper.js
+++ b/src/lib/regexHelper.js
@ -0,0 +1,149 @@
+import { last } from 'lodash';
+
+/**
+ * Concatenates the sources of an array of regular expressions, in order, into
+ * one new expression. The received expressions are left untouched and their
+ * own flags are ignored; the result only carries the flags passed here.
+ */
+export function joinPatternSegments(patterns, flags = '') {
+  const sources = patterns.map(({ source }) => source);
+  return new RegExp(sources.join(''), flags);
+}
+
+
+/**
+ * Builds one expression that matches any of the received expressions. Each
+ * source is wrapped in a non-capturing group and the groups are separated by
+ * alternation characters (|), so each expression is tried as its own
+ * alternative. The flags of the received expressions are ignored; the result
+ * only carries the flags passed here.
+ */
+export function combinePatterns(patterns, flags = '') {
+  const alternatives = patterns.map(({ source }) => `(?:${source})`);
+  return new RegExp(alternatives.join('|'), flags);
+}
+
+
+/**
+ * Modify substrings within a string if they match a pattern. Can be inverted
+ * to only modify non-matches.
+ *
+ * params:
+ * matchPattern - regexp - a regular expression to check for matches. It is
+ *   always applied globally from the start of `text`; the received object is
+ *   copied, so its `lastIndex` is neither read nor modified.
+ * replaceFn - function - a replacement function that receives a matched
+ *   substring and returns a replacement substring
+ * text - string - the string to process
+ * invertMatchPattern - boolean - if true, non-matching substrings are modified
+ *   instead of matching substrings
+ *
+ * returns: string - `text` with every selected substring replaced by the
+ *   result of `replaceFn`, and every other substring left unchanged.
+ */
+export function replaceWhen(matchPattern, replaceFn, text, invertMatchPattern) {
+  /**
+   * Work on a private copy that is guaranteed to be global. Without the `g`
+   * flag `RegExp.exec` returns the first match forever, and sharing the
+   * caller's object would leak `lastIndex` state in both directions.
+   */
+  const { flags, source } = matchPattern;
+  const exp = new RegExp(source, flags.includes('g') ? flags : `${flags}g`);
+
+  /**
+   * Split the string into an ordered array of objects with the following
+   * shape, covering the string from the start up to the end of the last match:
+   *
+   * {
+   *   index: number - the index of the substring within the string
+   *   text: string - the substring
+   *   match: boolean - true if the substring matched `matchPattern`
+   * }
+   *
+   * Iterates rather than recursing so that strings with many matches cannot
+   * exhaust the call stack.
+   */
+  const segments = [];
+  let cursor = 0;
+  let match = exp.exec(text);
+
+  while (match !== null) {
+    const matched = match[0];
+
+    if (matched === '') {
+      /**
+       * An empty match selects nothing and does not advance `lastIndex` on
+       * its own; step forward manually so the loop terminates.
+       */
+      exp.lastIndex += 1;
+    } else {
+      /**
+       * Record the non-matching gap (if any) between the previous match and
+       * this one, followed by the match itself.
+       */
+      if (match.index > cursor) {
+        segments.push({ index: cursor, text: text.slice(cursor, match.index), match: false });
+      }
+      segments.push({ index: match.index, text: matched, match: true });
+      cursor = match.index + matched.length;
+    }
+
+    match = exp.exec(text);
+  }
+
+  /**
+   * With no matches the whole string is a single non-match: it is only
+   * modified when non-matches are the target.
+   */
+  if (segments.length === 0) {
+    return invertMatchPattern ? replaceFn(text) : text;
+  }
+
+  /**
+   * Record the trailing substring after the final match, if one exists.
+   */
+  if (cursor < text.length) {
+    segments.push({ index: cursor, text: text.slice(cursor), match: false });
+  }
+
+  /**
+   * Map the segments to their string values, passing the selected ones
+   * through the replacement function, and return the joined string.
+   */
+  return segments
+    .map(segment => {
+      const isSelected = invertMatchPattern ? !segment.match : segment.match;
+      return isSelected ? replaceFn(segment.text) : segment.text;
+    })
+    .join('');
+}