escape markdown entities entered as literal text

2017-08-29 17:29:43 -04:00
parent ac8df98407
commit 8e805cabd8
3 changed files with 112 additions and 2 deletions
--- a/src/components/Widgets/Markdown/serializers/tests/remarkEscapeMarkdownEntities.spec.js
+++ b/src/components/Widgets/Markdown/serializers/tests/remarkEscapeMarkdownEntities.spec.js
@ -0,0 +1,33 @@
+import unified from 'unified';
+import u from 'unist-builder';
+import remarkEscapeMarkdownEntities from '../remarkEscapeMarkdownEntities';
+
+/**
+ * Test helper: wraps `text` in a single text node under a root node, runs that
+ * tree through the remarkEscapeMarkdownEntities plugin, and returns the value
+ * of the first child of the resulting tree.
+ */
+const process = text => {
+  const tree = u('root', [ u('text', text) ]);
+  const escapedMdast = unified()
+    .use(remarkEscapeMarkdownEntities)
+    .runSync(tree);
+
+  return escapedMdast.children[0].value;
+};
+
+describe('remarkEscapeMarkdownEntities', () => {
+  // Each of '*', '~', '`', '[' and '_' is expected to gain a preceding backslash.
+  it('should escape common markdown entities', () => {
+    expect(process('*~`[_')).toEqual('\\*\\~\\`\\[\\_');
+  });
+
+  // A '#' or '-' at the very start of the text is expected to be escaped.
+  it('should escape leading markdown entities', () => {
+    expect(process('#')).toEqual('\\#');
+    expect(process('-')).toEqual('\\-');
+  });
+
+  // Note the expected values: the whitespace that precedes the entity in the
+  // input is not present in the output.
+  it('should escape leading markdown entities preceded by whitespace', () => {
+    expect(process('\n #')).toEqual('\\#');
+    expect(process(' \n-')).toEqual('\\-');
+  });
+
+  // '#' and '-' that come after a non-whitespace character are expected to be
+  // left untouched, wherever they sit in the text.
+  it('should not escape leading markdown entities preceded by non-whitespace characters', () => {
+    expect(process('a# # b #')).toEqual('a# # b #');
+    expect(process('a- - b -')).toEqual('a- - b -');
+  });
+});
--- a/src/components/Widgets/Markdown/serializers/index.js
+++ b/src/components/Widgets/Markdown/serializers/index.js
@ -16,6 +16,7 @@ import remarkToSlatePlugin from './remarkSlate';
 import remarkSquashReferences from './remarkSquashReferences';
 import remarkImagesToText from './remarkImagesToText';
 import remarkShortcodes from './remarkShortcodes';
+import remarkEscapeMarkdownEntities from './remarkEscapeMarkdownEntities'
 import slateToRemarkParser from './slateRemark';
 import registry from '../../../../lib/registry';

@ -164,10 +165,30 @@ export const remarkToMarkdown = obj => {
   */
  const mdast = obj || u('root', [u('paragraph', [u('text', '')])]);

+  const remarkToMarkdownPluginOpts = {
+    commonmark: true,
+    fences: true,
+    pedantic: true,
+    listItemIndent: '1',
+
+    // Settings to emulate the defaults from the Prosemirror editor, not
+    // necessarily optimal. Should eventually be configurable.
+    bullet: '*',
+    strong: '*',
+    rule: '-',
+  };
+
+  /**
+   * Escape markdown entities found in text and html nodes within the MDAST.
+   */
+  const escapedMdast = unified()
+    .use(remarkEscapeMarkdownEntities)
+    .runSync(mdast);
+
  const markdown = unified()
-    .use(remarkToMarkdownPlugin, { listItemIndent: '1', fences: true, pedantic: true, commonmark: true })
+    .use(remarkToMarkdownPlugin, remarkToMarkdownPluginOpts)
    .use(remarkAllowAllText)
-    .stringify(mdast);
+    .stringify(escapedMdast);

  return markdown;
 };
--- a/src/components/Widgets/Markdown/serializers/remarkEscapeMarkdownEntities.js
+++ b/src/components/Widgets/Markdown/serializers/remarkEscapeMarkdownEntities.js
@ -0,0 +1,56 @@
+/**
+ * A Remark plugin for escaping markdown entities.
+ *
+ * When markdown entities are entered in raw markdown, they don't appear as
+ * characters in the resulting AST; for example, asterisks surrounding a piece
+ * of text cause the text to be inserted in a special node type, but the
+ * asterisks themselves aren't present as text. Therefore, we generally don't
+ * expect to encounter markdown characters in text nodes.
+ *
+ * However, the CMS visual editor does not interpret markdown characters, and
+ * users will expect these characters to be represented literally. In that case,
+ * we need to escape them, otherwise they'll be interpreted during
+ * stringification.
+ */
+export default function remarkEscapeMarkdownEntities() {
+  /**
+   * Prefix every '[', '*', '_', '`', and '~' in `text` with a backslash.
+   */
+  function escapeCommonChars(text) {
+    return text.replace(/[\[*_`~]/g, '\\$&');
+  }
+
+  /**
+   * Runs escapeCommonChars, then escapes a '#' or '-' that is the first
+   * non-whitespace character of the text.
+   *
+   * The pattern is anchored to the start of the string and the replacement
+   * begins with "$`" (the portion of the string before the match, which is
+   * empty here), so any whitespace preceding the '#' or '-' is dropped from the
+   * result rather than preserved.
+   */
+  function escapeAllChars(text) {
+    const partiallyEscapedMarkdown = escapeCommonChars(text);
+    return partiallyEscapedMarkdown.replace(/^\s*([#-])/, '$`\\$1');
+  }
+
+  /**
+   * Builds a new node object from `node` with its children (if any) transformed
+   * recursively; text and html nodes also get an escaped `value`.
+   *
+   * `index` is the node's position among its parent's children when invoked
+   * through the `map` call below; for any other caller it is whatever was
+   * passed as the second argument.
+   */
+  const transform = (node, index) => {
+    // Left as the original falsy value (e.g. undefined) when the node has no
+    // children; it is still spread into the returned object below.
+    const children = node.children && node.children.map(transform);
+
+    /**
+     * Escape characters in text and html nodes only. We store a lot of normal
+     * text in html nodes to keep Remark from escaping html entities.
+     */
+    if (['text', 'html'].includes(node.type)) {
+
+      /**
+       * Escape all characters (including a leading '#' or '-') if this is the
+       * first child node, otherwise only the common characters.
+       */
+      const value = index === 0 ? escapeAllChars(node.value) : escapeCommonChars(node.value);
+      return { ...node, value, children };
+    }
+
+    /**
+     * Always return nodes with recursively mapped children.
+     */
+    return {...node, children };
+  };
+
+  // The plugin's result is the transform function itself.
+  return transform;
+}