Merge pull request #640 from netlify/utf8-slugs

Support Unicode characters in slugs
This commit is contained in:
Shawn Erquhart 2017-10-04 10:34:42 -04:00 committed by GitHub
commit 876cb2ca94
5 changed files with 172 additions and 16 deletions

View File

@ -149,12 +149,12 @@
"remark-parse": "^3.0.1",
"remark-rehype": "^2.0.0",
"remark-stringify": "^3.0.1",
"sanitize-filename": "^1.6.1",
"semaphore": "^1.0.5",
"slate": "^0.21.0",
"slate-edit-list": "^0.7.1",
"slate-edit-table": "^0.10.1",
"slate-soft-break": "^0.3.0",
"slug": "^0.9.1",
"toml-j0.4": "^1.1.1",
"unified": "^6.1.4",
"unist-builder": "^1.0.2",

View File

@ -5,7 +5,7 @@ import GitGatewayBackend from "./git-gateway/implementation";
import { resolveFormat } from "../formats/formats";
import { selectListMethod, selectEntrySlug, selectEntryPath, selectAllowNewEntries, selectFolderEntryExtension } from "../reducers/collections";
import { createEntry } from "../valueObjects/Entry";
import slug from 'slug';
import { sanitizeSlug } from "../lib/urlHelper";
class LocalStorageAuthStore {
storageKey = "netlify-cms-user";
@ -42,7 +42,7 @@ const slugFormatter = (template = "{{slug}}", entryData) => {
return identifier;
};
return template.replace(/\{\{([^\}]+)\}\}/g, (_, field) => {
const slug = template.replace(/\{\{([^\}]+)\}\}/g, (_, field) => {
switch (field) {
case "year":
return date.getFullYear();
@ -51,11 +51,18 @@ const slugFormatter = (template = "{{slug}}", entryData) => {
case "day":
return (`0${ date.getDate() }`).slice(-2);
case "slug":
return slug(getIdentifier(entryData).trim(), {lower: true});
return getIdentifier(entryData).trim();
default:
return slug(entryData.get(field, "").trim(), {lower: true});
return entryData.get(field, "").trim();
}
});
})
// Convert slug to lower-case
.toLocaleLowerCase()
// Replace periods and spaces with dashes.
.replace(/[.\s]/g, '-');
return sanitizeSlug(slug);
};
class Backend {

View File

@ -0,0 +1,97 @@
import { sanitizeIRI, sanitizeSlug } from '../urlHelper';
describe('sanitizeIRI', () => {
  // The expectations in this suite follow the character rules of RFC 3987.
  it('should keep valid URI chars (letters digits _ - . ~)', () => {
    const sanitized = sanitizeIRI("This, that-one_or.the~other 123!");
    expect(sanitized).toEqual('Thisthat-one_or.the~other123');
  });

  it('should not remove accents', () => {
    const sanitized = sanitizeIRI("ěščřžý");
    expect(sanitized).toEqual('ěščřžý');
  });

  it('should keep valid non-latin chars (ucschars in RFC 3987)', () => {
    const sanitized = sanitizeIRI("日本語のタイトル");
    expect(sanitized).toEqual('日本語のタイトル');
  });

  it('should not normalize Unicode strings', () => {
    // The input must come back exactly as given, not as the different
    // code point sequence listed in the second expectation.
    const sanitizeInput = () => sanitizeIRI('\u017F\u0323\u0307');
    expect(sanitizeInput()).toEqual('\u017F\u0323\u0307');
    expect(sanitizeInput()).not.toEqual('\u1E9B\u0323');
  });

  it('should allow a custom replacement character', () => {
    const options = { replacement: '-' };
    const sanitized = sanitizeIRI("duck\\goose.elephant", options);
    expect(sanitized).toEqual('duck-goose.elephant');
  });

  it('should not allow an improper replacement character', () => {
    // "$" is not itself an allowed character, so it must be rejected as a replacement.
    const sanitizeWithDollar = () => sanitizeIRI("I! like! dollars!", { replacement: '$' });
    expect(sanitizeWithDollar).toThrow();
  });

  it('should not actually URI-encode the characters', () => {
    const sanitizeEmoji = () => sanitizeIRI("🎉");
    expect(sanitizeEmoji()).toEqual('🎉');
    expect(sanitizeEmoji()).not.toEqual("%F0%9F%8E%89");
  });
});
describe('sanitizeSlug', () => {
  it('throws an error for non-strings', () => {
    const nonStrings = [{}, [], false, null, 11234, undefined, () => {}];
    nonStrings.forEach((value) => {
      expect(() => sanitizeSlug(value)).toThrowError("The input slug must be a string.");
    });
  });

  it('throws an error for non-string replacements', () => {
    // `undefined` is deliberately absent from this list: an undefined
    // `replacement` falls back to the default value, so no error is raised for it.
    const nonStringReplacements = [{}, [], false, null, 11232, () => {}];
    nonStringReplacements.forEach((replacement) => {
      expect(() => sanitizeSlug('test', { replacement })).toThrowError("`options.replacement` must be a string.");
    });
  });

  it('should keep valid URI chars (letters digits _ - . ~)', () => {
    const sanitized = sanitizeSlug("This, that-one_or.the~other 123!");
    expect(sanitized).toEqual('This-that-one_or.the~other-123');
  });

  it('removes double replacements', () => {
    ['test--test', 'test test'].forEach((input) => {
      expect(sanitizeSlug(input)).toEqual('test-test');
    });
  });

  it('removes trailing replacemenets', () => {
    const sanitized = sanitizeSlug('test test ');
    expect(sanitized).toEqual('test-test');
  });

  it('uses alternate replacements', () => {
    const sanitized = sanitizeSlug('test test ', { replacement: '_' });
    expect(sanitized).toEqual('test_test');
  });
});

View File

@ -1,4 +1,6 @@
import url from 'url';
import sanitizeFilename from 'sanitize-filename';
import { isString, escapeRegExp, flow, partialRight } from 'lodash';
function getUrl(url, direct) {
return `${ direct ? '/#' : '' }${ url }`;
@ -12,6 +14,50 @@ export function getNewEntryUrl(collectionName, direct) {
return getUrl(`/collections/${ collectionName }/entries/new`, direct);
}
/* See https://www.w3.org/International/articles/idn-and-iri/#path.
* According to the new IRI (Internationalized Resource Identifier) spec, RFC 3987,
* ASCII chars should be kept the same way as in standard URIs (letters digits _ - . ~).
* Non-ASCII chars (unless they are not in the allowed "ucschars" list) should be percent-encoded.
* If the string is not encoded in Unicode, it should be converted to UTF-8 and normalized first,
* but JS stores strings as UTF-16/UCS-2 internally, so we should not normalize or re-encode.
*/
// ASCII characters kept as-is: letters, digits, "_", "-", "." and "~".
const uriChars = /[\w\-.~]/i;
// Non-ASCII code point ranges treated as allowed "ucschars" (RFC 3987).
// The `u` flag lets the `\u{...}` escapes address code points above U+FFFF.
const ucsChars = /[\xA0-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/u;
// True when `char` matches at least one of the two character classes above.
const validIRIChar = (char) => [uriChars, ucsChars].some((pattern) => pattern.test(char));
/**
 * Replace every character of `str` that `validIRIChar` rejects.
 *
 * The characters are NOT URI-encoded here (that is the browser's and server's job);
 * characters that are not allowed are simply swapped for `replacement`, which
 * removes them when `replacement` is the default empty string.
 *
 * @param {string} str - The text to sanitize.
 * @param {Object} [options]
 * @param {string} [options.replacement=""] - Text substituted for each disallowed
 *   character. Every character of it must itself be allowed.
 * @returns {string} The sanitized text.
 * @throws {Error} If `str` or `options.replacement` is not a string, or if
 *   `options.replacement` contains a disallowed character.
 */
export function sanitizeIRI(str, { replacement = "" } = {}) {
  // Throw real Error objects (not bare strings) so callers get a stack trace
  // and can rely on `instanceof Error` / `.message`.
  if (!isString(str)) {
    throw new Error("The input slug must be a string.");
  }
  if (!isString(replacement)) {
    throw new Error("`options.replacement` must be a string.");
  }

  // Check and make sure the replacement character is actually a safe char itself.
  if (!Array.from(replacement).every((char) => validIRIChar(char))) {
    throw new Error("The replacement character(s) (options.replacement) is itself unsafe.");
  }

  // `Array.from` must be used instead of `String.split` because it iterates by
  // code point, whereas `split` breaks characters such as emoji into their two
  // UTF-16 surrogate halves.
  return Array.from(str)
    .map((char) => (validIRIChar(char) ? char : replacement))
    .join('');
}
/**
 * Turn `str` into a slug by running it through `sanitizeIRI` and then
 * `sanitizeFilename`, both with the same `replacement`, and tidying up the
 * replacement characters those sanitizers inserted.
 *
 * Runs of the replacement are collapsed into a single occurrence, and a
 * replacement at the very start or very end of the result is removed.
 *
 * @param {string} str - The text to turn into a slug.
 * @param {Object} [options]
 * @param {string} [options.replacement="-"] - Text substituted for each
 *   character the sanitizers reject.
 * @returns {string} The sanitized, normalized slug.
 * @throws {Error} If `str` or `options.replacement` is not a string. Errors
 *   raised by the underlying sanitizers propagate unchanged.
 */
export function sanitizeSlug(str, { replacement = '-' } = {}) {
  // Throw real Error objects (not bare strings) so callers get a stack trace
  // and can rely on `instanceof Error` / `.message`.
  if (!isString(str)) {
    throw new Error("The input slug must be a string.");
  }
  if (!isString(replacement)) {
    throw new Error("`options.replacement` must be a string.");
  }

  // Sanitize as IRI (i18n URI) and as filename.
  const sanitize = flow([
    partialRight(sanitizeIRI, { replacement }),
    partialRight(sanitizeFilename, { replacement }),
  ]);
  const sanitizedSlug = sanitize(str);

  // The replacement is arbitrary text, so escape it before building patterns from it.
  const escapedReplacement = escapeRegExp(replacement);
  const repeatedReplacement = new RegExp(`(?:${ escapedReplacement })+`, 'g');
  const leadingReplacement = new RegExp(`^${ escapedReplacement }`);
  const trailingReplacement = new RegExp(`${ escapedReplacement }$`);

  // Collapse runs first so that at most one replacement is left at each end,
  // then strip it from both the start and the end of the slug.
  return sanitizedSlug
    .replace(repeatedReplacement, replacement)
    .replace(leadingReplacement, '')
    .replace(trailingReplacement, '');
}
export function urlize(string) {
const sanitized = makePathSanitized(string);
const parsedURL = url.parse(sanitized);

View File

@ -7622,6 +7622,12 @@ sane@~1.6.0:
walker "~1.0.5"
watch "~0.10.0"
sanitize-filename@^1.6.1:
version "1.6.1"
resolved "https://registry.yarnpkg.com/sanitize-filename/-/sanitize-filename-1.6.1.tgz#612da1c96473fa02dccda92dcd5b4ab164a6772a"
dependencies:
truncate-utf8-bytes "^1.0.0"
sax@^1.2.1, sax@~1.2.1:
version "1.2.4"
resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9"
@ -7815,12 +7821,6 @@ slice-ansi@0.0.4:
version "0.0.4"
resolved "https://registry.yarnpkg.com/slice-ansi/-/slice-ansi-0.0.4.tgz#edbf8903f66f7ce2f8eafd6ceed65e264c831b35"
slug@^0.9.1:
version "0.9.1"
resolved "https://registry.yarnpkg.com/slug/-/slug-0.9.1.tgz#af08f608a7c11516b61778aa800dce84c518cfda"
dependencies:
unicode ">= 0.3.1"
sntp@1.x.x:
version "1.0.9"
resolved "https://registry.yarnpkg.com/sntp/-/sntp-1.0.9.tgz#6541184cc90aeea6c6e7b35e2659082443c66198"
@ -8547,6 +8547,12 @@ trough@^1.0.0:
version "1.0.1"
resolved "https://registry.yarnpkg.com/trough/-/trough-1.0.1.tgz#a9fd8b0394b0ae8fff82e0633a0a36ccad5b5f86"
truncate-utf8-bytes@^1.0.0:
version "1.0.2"
resolved "https://registry.yarnpkg.com/truncate-utf8-bytes/-/truncate-utf8-bytes-1.0.2.tgz#405923909592d56f78a5818434b0b78489ca5f2b"
dependencies:
utf8-byte-length "^1.0.1"
tryit@^1.0.1:
version "1.0.3"
resolved "https://registry.yarnpkg.com/tryit/-/tryit-1.0.3.tgz#393be730a9446fd1ead6da59a014308f36c289cb"
@ -8635,10 +8641,6 @@ unherit@^1.0.4:
inherits "^2.0.1"
xtend "^4.0.1"
"unicode@>= 0.3.1":
version "9.0.1"
resolved "https://registry.yarnpkg.com/unicode/-/unicode-9.0.1.tgz#104706272c6464c574801be1b086f7245cf25158"
unified@^6.1.4:
version "6.1.5"
resolved "https://registry.yarnpkg.com/unified/-/unified-6.1.5.tgz#716937872621a63135e62ced2f3ac6a063c6fb87"
@ -8793,6 +8795,10 @@ user-home@^2.0.0:
dependencies:
os-homedir "^1.0.0"
utf8-byte-length@^1.0.1:
version "1.0.4"
resolved "https://registry.yarnpkg.com/utf8-byte-length/-/utf8-byte-length-1.0.4.tgz#f45f150c4c66eee968186505ab93fcbb8ad6bf61"
util-deprecate@~1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"