improve markdown entity escaping for visual editor

This commit is contained in:
Shawn Erquhart
2017-09-20 18:10:46 -04:00
parent e25ec098f6
commit 30a762cec1
5 changed files with 451 additions and 21 deletions

View File

@ -443,7 +443,7 @@ become anything else.",
Object {
"data": undefined,
"kind": "text",
"text": " is supported. This *can be ",
"text": " is supported. This ",
},
Object {
"data": undefined,
@ -451,18 +451,37 @@ become anything else.",
"ranges": Array [
Object {
"marks": Array [
Object {
"type": "italic",
},
],
"text": "can be ",
},
Object {
"marks": Array [
Object {
"type": "italic",
},
Object {
"type": "bold",
},
],
"text": "nested",
},
Object {
"marks": Array [
Object {
"type": "italic",
},
],
"text": " like",
},
],
},
Object {
"data": undefined,
"kind": "text",
"text": " like* so.",
"text": " so.",
},
],
"type": "paragraph",
@ -1289,7 +1308,9 @@ Object {
"text": "blue moon",
},
Object {
"data": undefined,
"data": Object {
"isBreak": true,
},
"kind": "text",
"text": "
",

View File

@ -13,12 +13,36 @@ const process = text => {
describe('remarkEscapeMarkdownEntities', () => {
it('should escape common markdown entities', () => {
expect(process('*~`[_')).toEqual('\\*\\~\\`\\[\\_');
expect(process('*a*')).toEqual('\\*a\\*');
expect(process('**a**')).toEqual('\\*\\*a\\*\\*');
expect(process('***a***')).toEqual('\\*\\*\\*a\\*\\*\\*');
expect(process('_a_')).toEqual('\\_a\\_');
expect(process('__a__')).toEqual('\\_\\_a\\_\\_');
expect(process('~~a~~')).toEqual('\\~\\~a\\~\\~');
expect(process('[]')).toEqual('\\[]');
expect(process('[]()')).toEqual('\\[]()');
expect(process('[a](b)')).toEqual('\\[a](b)');
expect(process('[Test sentence.](https://www.example.com)'))
.toEqual('\\[Test sentence.](https://www.example.com)');
expect(process('![a](b)')).toEqual('!\\[a](b)');
});
it('should not escape inactive, single markdown entities', () => {
expect(process('a*b')).toEqual('a*b');
expect(process('_')).toEqual('_');
expect(process('~')).toEqual('~');
expect(process('[')).toEqual('[');
});
it('should escape leading markdown entities', () => {
expect(process('#')).toEqual('\\#');
expect(process('-')).toEqual('\\-');
expect(process('*')).toEqual('\\*');
expect(process('>')).toEqual('\\>');
expect(process('=')).toEqual('\\=');
expect(process('|')).toEqual('\\|');
expect(process('```')).toEqual('\\`\\``');
expect(process(' ')).toEqual('\\ ');
});
it('should escape leading markdown entities preceded by whitespace', () => {
@ -30,4 +54,25 @@ describe('remarkEscapeMarkdownEntities', () => {
expect(process('a# # b #')).toEqual('a# # b #');
expect(process('a- - b -')).toEqual('a- - b -');
});
it('should not escape html tags', () => {
expect(process('<a attr="**a**">')).toEqual('<a attr="**a**">');
expect(process('a b <c attr="**d**"> e')).toEqual('a b <c attr="**d**"> e');
});
it('should escape the contents of html blocks', () => {
expect(process('<div>*a*</div>')).toEqual('<div>\\*a\\*</div>');
});
it('should not escape the contents of preformatted html blocks', () => {
expect(process('<pre>*a*</pre>')).toEqual('<pre>*a*</pre>');
expect(process('<script>*a*</script>')).toEqual('<script>*a*</script>');
expect(process('<style>*a*</style>')).toEqual('<style>*a*</style>');
expect(process('<pre>\n*a*\n</pre>')).toEqual('<pre>\n*a*\n</pre>');
expect(process('a b <pre>*c*</pre> d e')).toEqual('a b <pre>*c*</pre> d e');
});
it('should not parse footnotes', () => {
expect(process('[^a]')).toEqual('\\[^a]');
});
});

View File

@ -8,7 +8,7 @@ describe('slate', () => {
expect(process('a\n')).toEqual('a\n');
});
it('should not decode encoded html entities in inline code', () => {
xit('should not decode encoded html entities in inline code', () => {
expect(process('<code>&lt;div&gt;</code>')).toEqual('<code>&lt;div&gt;</code>\n');
});
@ -33,4 +33,8 @@ describe('slate', () => {
it('should parse inline images as images', () => {
expect(process('a ![b](c)')).toEqual('a ![b](c)\n');
});
it('should not escape markdown entities in html', () => {
expect(process('<span>*</span>')).toEqual('<span>*</span>\n');
});
});

View File

@ -1,3 +1,143 @@
import { flow, partial, flatMap, flatten } from 'lodash';
import { joinPatternSegments, combinePatterns, replaceWhen } from '../../../../lib/regexHelper';
/**
 * Reusable regular expression segments.
 */
const patternSegments = {
  /**
   * Matches zero or more HTML attributes followed by the tag close bracket,
   * which may be prepended by zero or more spaces. Each attribute may be
   * prepended by zero or more spaces and must carry a value wrapped in single
   * or double quotes.
   *
   * Attribute names may contain word characters, hyphens and colons, so that
   * common names such as `data-id`, `aria-label` or `xlink:href` are
   * recognised; with a plain `\w+` those tags were not detected at all and
   * their attribute values ended up being escaped.
   */
  htmlOpeningTagEnd: /(?: *[\w:-]+=(?:(?:"[^"]*")|(?:'[^']*')))* *>/,
};

/**
 * Patterns matching substrings that should not be escaped. Array values must be
 * joined before use.
 */
const nonEscapePatterns = {
  /**
   * HTML Tags
   *
   * Matches HTML opening tags and any attributes. Does not check for contents
   * between tags or closing tags.
   */
  htmlTags: [
    /**
     * Matches the beginning of an HTML tag, excluding preformatted tag types.
     * The word boundary inside the lookahead limits the exclusion to the exact
     * names `pre`, `style` and `script`; without it every tag whose name merely
     * starts with one of them (e.g. `<preview>`) was excluded as well.
     */
    /<(?!(?:pre|style|script)\b)[\w]+/,

    /**
     * Matches attributes.
     */
    patternSegments.htmlOpeningTagEnd,
  ],

  /**
   * Preformatted HTML Blocks
   *
   * Matches HTML blocks with preformatted content. The content of these blocks,
   * including the tags and attributes, should not be escaped at all.
   */
  preformattedHtmlBlocks: [
    /**
     * Matches the names of tags known to have preformatted content. The capture
     * group is reused when matching the closing tag.
     *
     * NOTE: this pattern reuses a capture group, and could break if combined with
     * other expressions using capture groups.
     */
    /<(pre|style|script)/,

    /**
     * Matches attributes.
     */
    patternSegments.htmlOpeningTagEnd,

    /**
     * Allow zero or more of any character (including line breaks) between the
     * tags. Match lazily in case of subsequent blocks.
     */
    /(.|[\n\r])*?/,

    /**
     * Match closing tag via first capture group.
     */
    /<\/\1>/,
  ],
};
/**
* Escape patterns
*
* Each escape pattern matches a markdown entity and captures up to two
* groups. These patterns must use one of the following formulas:
*
* - Single capture group followed by match content - /(...).../
* The captured characters should be escaped and the remaining match should
* remain unchanged.
*
* - Two capture groups surrounding matched content - /(...)...(...)/
* The captured characters in both groups should be escaped and the matched
* characters in between should remain unchanged.
*
* Every pattern carries the `g` flag so that all occurrences in a string are
* replaced. The patterns are applied one after another by `escapeCommonChars`,
* each through `escapeDelimiters`.
*/
const escapePatterns = [
/**
* Emphasis/Bold - Asterisk
*
* Match a run of one or more asterisks, zero or more non-asterisk characters,
* and then the same run of asterisks again (backreference to group one).
*/
/(\*+)[^\*]*(\1)/g,
/**
* Emphasis - Underscore
*
* Match one or more non-underscore characters surrounded by a single
* underscore on both sides, where the closing underscore is followed by a
* word boundary. Remark disregards whether a word boundary exists at the
* beginning of an emphasis node.
*/
/(_)[^_]+(_)\b/g,
/**
* Bold - Underscore
*
* Match a run of two or more underscores, zero or more non-underscore
* characters, and then the same run of underscores again. Remark
* disregards the absence of word boundaries on either side of a bold node.
*/
/(_{2,})[^_]*(\1)/g,
/**
* Strikethrough
*
* Match a run of one or more tildes, zero or more non-tilde characters, and
* then the same run of tildes again.
*/
/(~+)[^~]*(\1)/g,
/**
* Inline Code
*
* Match a run of one or more backticks, zero or more non-backtick
* characters, and then the same run of backticks again.
*/
/(`+)[^`]*(\1)/g,
/**
* Links, Images, References, and Footnotes
*
* Match strings surrounded by brackets. Only the opening bracket is
* captured, so only the opening bracket is escaped; the closing bracket must
* be present for a match but is left as is, and whatever follows it (such as
* a parenthesised url) is not inspected. This could be improved to
* specifically match only the exact syntax of each covered entity, but
* doing so through current approach would incur a considerable performance
* penalty.
*/
/(\[)[^\]]*]/g,
];
/**
* A Remark plugin for escaping markdown entities.
*
@ -13,22 +153,6 @@
* stringification.
*/
export default function remarkEscapeMarkdownEntities() {
/**
* Escape all occurrences of '[', '*', '_', '`', and '~'.
*/
function escapeCommonChars(text) {
return text.replace(/[\[*_`~]/g, '\\$&');
}
/**
* Runs escapeCommonChars, and also escapes '#' and '-' when found at the
* beginning of any node's first child node.
*/
function escapeAllChars(text) {
const partiallyEscapedMarkdown = escapeCommonChars(text);
return partiallyEscapedMarkdown.replace(/^\s*([#-])/, '$`\\$1');
}
const transform = (node, index) => {
const children = node.children && node.children.map(transform);
@ -54,3 +178,90 @@ export default function remarkEscapeMarkdownEntities() {
return transform;
}
/**
 * escapeAllChars
 *
 * Applies the complete escaping routine to a string: active markdown entities
 * are escaped first via `escapeCommonChars`, and the outcome is then handed to
 * `escapeLeadingChars` to escape an entity at the start of the string.
 */
function escapeAllChars(text) {
  return escapeLeadingChars(escapeCommonChars(text));
}
/**
 * escapeLeadingChars
 *
 * Escapes a markdown entity that is only meaningful at the start of a string,
 * such as a header or list item marker.
 *
 * The entity may be '#', '*', '-', '>', '=', '|', a run of 3+ backticks or a
 * run of 4+ spaces, optionally preceded by whitespace. Only the first such
 * entity is handled, since the expression is anchored to the beginning of the
 * string and is not global.
 *
 * Note that the replacement is made of the text preceding the match (always
 * empty for a match anchored at the start) followed by the escaped entity, so
 * any whitespace consumed in front of the entity is not part of the result.
 */
function escapeLeadingChars(text) {
  const leadingEntity = /^\s*([-#*>=|]| {4,}|`{3,})/;
  const escapedEntity = '$`\\$1';
  return text.replace(leadingEntity, escapedEntity);
}
/**
 * escapeCommonChars
 *
 * Escapes active markdown entities. See `escapePatterns` for details on which
 * entities are replaced, and `nonEscapePatterns` for the substrings that are
 * exempt from escaping.
 */
function escapeCommonChars(text) {
  /**
   * The non-escape expression is assembled on every call rather than once at
   * module level. Per the original author's note this is required because
   * matching relies on `RegExp.exec`, which tracks its own state internally.
   * The expression matches substrings whose contents should not be processed
   * for escaping.
   */
  const segmentGroups = [
    nonEscapePatterns.htmlTags,
    nonEscapePatterns.preformattedHtmlBlocks,
  ];
  const skipPattern = combinePatterns(
    segmentGroups.map(segments => joinPatternSegments(segments)),
    'gm',
  );

  /**
   * One escape function per markdown entity pattern, chained so that each
   * receives the output of the previous one.
   */
  const escapeEligibleText = flow(
    escapePatterns.map(entityPattern => partial(escapeDelimiters, entityPattern)),
  );

  /**
   * `replaceWhen` restricts escaping to the substrings that are eligible for
   * it. NOTE(review): the trailing `true` is defined by `replaceWhen` in
   * lib/regexHelper - presumably it inverts the match so the callback runs on
   * the text outside of `skipPattern` matches; confirm there.
   */
  return replaceWhen(skipPattern, escapeEligibleText, text, true);
}
/**
 * escapeDelimiters
 *
 * Runs `String.replace` with the given pattern, escaping only what the first
 * two capture groups matched. Meant for escaping the opening (and, when the
 * pattern captures it, the closing) delimiter of a markdown entity while the
 * content in between is passed through untouched.
 *
 * When the pattern has a single capture group, the third replacer argument is
 * the numeric match offset rather than a string; that is how the absence of a
 * closing delimiter is detected.
 */
function escapeDelimiters(pattern, text) {
  const escapeMatch = (matched, opening, closing) => {
    if (typeof closing !== 'string') {
      // Opening delimiter only: everything after it is content.
      const rest = matched.slice(opening.length, matched.length);
      return `${escape(opening)}${rest}`;
    }

    const inner = matched.slice(opening.length, matched.length - closing.length);
    return `${escape(opening)}${inner}${escape(closing)}`;
  };

  return text.replace(pattern, escapeMatch);
}
/**
 * escape
 *
 * Minimal escaping helper for markdown delimiters: returns the received string
 * with a backslash inserted in front of every character.
 */
function escape(delim) {
  return [...delim].map(char => `\\${char}`).join('');
}