diff --git a/src/components/Widgets/Markdown/serializers/__tests__/remarkEscapeMarkdownEntities.spec.js b/src/components/Widgets/Markdown/serializers/__tests__/remarkEscapeMarkdownEntities.spec.js new file mode 100644 index 00000000..515ae75f --- /dev/null +++ b/src/components/Widgets/Markdown/serializers/__tests__/remarkEscapeMarkdownEntities.spec.js @@ -0,0 +1,33 @@ +import unified from 'unified'; +import u from 'unist-builder'; +import remarkEscapeMarkdownEntities from '../remarkEscapeMarkdownEntities'; + +/** Test helper: wraps the given text in a root node holding a single text node, runs the plugin over it synchronously, and returns the value of that text node from the resulting tree. NOTE(review): inside this spec the name shadows the Node global process object. */ const process = text => { + const tree = u('root', [ u('text', text) ]); + const escapedMdast = unified() + .use(remarkEscapeMarkdownEntities) + .runSync(tree); + + return escapedMdast.children[0].value; +}; + +describe('remarkEscapeMarkdownEntities', () => { + it('should escape common markdown entities', () => { + expect(process('*~`[_')).toEqual('\\*\\~\\`\\[\\_'); + }); + + it('should escape leading markdown entities', () => { + expect(process('#')).toEqual('\\#'); + expect(process('-')).toEqual('\\-'); + }); + + it('should escape leading markdown entities preceded by whitespace', () => { /* The expected values contain no leading whitespace: the whitespace in front of the escaped marker is removed from the output rather than preserved. */ + expect(process('\n #')).toEqual('\\#'); + expect(process(' \n-')).toEqual('\\-'); + }); + + it('should not escape leading markdown entities preceded by non-whitespace characters', () => { + expect(process('a# # b #')).toEqual('a# # b #'); + expect(process('a- - b -')).toEqual('a- - b -'); + }); +}); diff --git a/src/components/Widgets/Markdown/serializers/index.js b/src/components/Widgets/Markdown/serializers/index.js index e524db50..9fff5db6 100644 --- a/src/components/Widgets/Markdown/serializers/index.js +++ b/src/components/Widgets/Markdown/serializers/index.js @@ -16,6 +16,7 @@ import remarkToSlatePlugin from './remarkSlate'; import remarkSquashReferences from './remarkSquashReferences'; import remarkImagesToText from './remarkImagesToText'; import remarkShortcodes from './remarkShortcodes'; +import remarkEscapeMarkdownEntities from './remarkEscapeMarkdownEntities' 
import slateToRemarkParser from './slateRemark'; import registry from '../../../../lib/registry'; @@ -164,10 +165,30 @@ export const remarkToMarkdown = obj => { */ const mdast = obj || u('root', [u('paragraph', [u('text', '')])]); + const remarkToMarkdownPluginOpts = { + commonmark: true, + fences: true, + pedantic: true, + listItemIndent: '1', + + // Settings to emulate the defaults from the Prosemirror editor, not + // necessarily optimal. Should eventually be configurable. + bullet: '*', + strong: '*', + rule: '-', + }; + + /** + * Escape markdown entities found in text and html nodes within the MDAST. + */ + const escapedMdast = unified() + .use(remarkEscapeMarkdownEntities) + .runSync(mdast); + const markdown = unified() - .use(remarkToMarkdownPlugin, { listItemIndent: '1', fences: true, pedantic: true, commonmark: true }) + .use(remarkToMarkdownPlugin, remarkToMarkdownPluginOpts) .use(remarkAllowAllText) - .stringify(mdast); + .stringify(escapedMdast); return markdown; }; diff --git a/src/components/Widgets/Markdown/serializers/remarkEscapeMarkdownEntities.js b/src/components/Widgets/Markdown/serializers/remarkEscapeMarkdownEntities.js new file mode 100644 index 00000000..d38bc8bd --- /dev/null +++ b/src/components/Widgets/Markdown/serializers/remarkEscapeMarkdownEntities.js @@ -0,0 +1,56 @@ +/** + * A Remark plugin for escaping markdown entities. + * + * When markdown entities are entered in raw markdown, they don't appear as + * characters in the resulting AST; for example, asterisks surrounding a piece of + * text cause the text to be inserted in a special node type, but the asterisks + * themselves aren't present as text. Therefore, we generally don't expect to + * encounter markdown characters in text nodes. + * + * However, the CMS visual editor does not interpret markdown characters, and + * users will expect these characters to be represented literally. In that case, + * we need to escape them, otherwise they'll be interpreted during + * stringification. 
/**
 * Remark plugin that backslash-escapes markdown entities in the `value` of
 * `text` and `html` nodes, so that characters meant literally are not
 * interpreted as markdown syntax when the tree is stringified.
 *
 * The input tree is not mutated: every node is shallow-copied and its
 * children, when present, are transformed recursively.
 *
 * @returns {function(Object, number): Object} Transformer that takes a node
 *   and its index among its siblings and returns the escaped copy.
 */
export default function remarkEscapeMarkdownEntities() {
  /** Node types whose `value` holds literal text that must be escaped. */
  const escapableTypes = ['text', 'html'];

  /** Characters escaped wherever they occur: '[', '*', '_', '`' and '~'. */
  const commonCharsPattern = /[\[*_`~]/g;

  /** A '#' or '-' at the very start of the text, after optional whitespace. */
  const leadingCharPattern = /^\s*([#-])/;

  /**
   * Escape all occurrences of '[', '*', '_', '`', and '~'.
   *
   * @param {string} text - Raw node text.
   * @returns {string} Text with each listed character prefixed by a backslash.
   */
  function escapeCommonChars(text) {
    return text.replace(commonCharsPattern, '\\$&');
  }

  /**
   * Runs escapeCommonChars, and also escapes a '#' or '-' found at the start
   * of the text.
   *
   * Whitespace in front of that leading character is dropped rather than
   * preserved ('\n #' becomes '\\#'); this is the behaviour the plugin has
   * always had and the one its spec expects.
   *
   * @param {string} text - Raw node text.
   * @returns {string} Fully escaped text.
   */
  function escapeAllChars(text) {
    return escapeCommonChars(text).replace(leadingCharPattern, '\\$1');
  }

  /**
   * Returns an escaped shallow copy of `node`, recursing into its children.
   *
   * @param {Object} node - Node to transform.
   * @param {number} index - Position of `node` among its siblings; only a
   *   first child (index 0) gets the leading '#' / '-' escaping.
   * @returns {Object} New node; the original is left untouched.
   */
  const transform = (node, index) => {
    const result = { ...node };

    /**
     * Only attach `children` when the node really has some, so that leaf
     * nodes do not gain a stray `children: undefined` property.
     */
    if (Array.isArray(node.children)) {
      result.children = node.children.map(transform);
    }

    /**
     * Escape characters in text and html nodes only. Nodes of these types
     * without a string value are passed through unchanged instead of throwing.
     */
    if (escapableTypes.includes(node.type) && typeof node.value === 'string') {
      result.value = index === 0 ? escapeAllChars(node.value) : escapeCommonChars(node.value);
    }

    return result;
  };

  return transform;
}