Simplify linkification filter - check for ASCII/non-ASCII only

This commit is contained in:
Scott Nonnenberg 2019-12-18 11:45:11 -08:00 committed by Ken Powers
parent f693768bcf
commit f5be32ba14
2 changed files with 15 additions and 134 deletions

View File

@ -212,134 +212,7 @@ function assembleChunks(chunkDescriptors) {
return concatenateBytes(...chunks);
}
// Matches any single Latin-script letter in the Basic Latin, Latin-1
// Supplement and Latin Extended-A/B ranges listed below.
const LATIN_PATTERN = new RegExp(
  '[' +
    '\\u0041-\\u005A' + // A-Z
    '\\u0061-\\u007A' + // a-z
    '\\u00AA' + // feminine ordinal indicator
    '\\u00BA' + // masculine ordinal indicator
    // Stops at U+00D6 so that U+00D7 (multiplication sign), which is not a
    // Latin letter, is left out of the class.
    '\\u00C0-\\u00D6' +
    '\\u00D8-\\u00F6' +
    '\\u00F8-\\u01BA' +
    ']'
);
// Character class matching any single code point in the ranges listed below.
// isChunkSneaky uses it to detect Cyrillic characters in a domain chunk.
const CYRILLIC_PATTERN = new RegExp(
  [
    '[',
    '\\u0400-\\u0481',
    '\\u0482',
    '\\u0483-\\u0484',
    '\\u0487',
    '\\u0488-\\u0489',
    '\\u048A-\\u052F',
    '\\u1C80-\\u1C88',
    '\\u1D2B',
    '\\u1D78',
    '\\u2DE0-\\u2DFF',
    '\\uA640-\\uA66D',
    '\\uA66E',
    '\\uA66F',
    '\\uA670-\\uA672',
    '\\uA673',
    '\\uA674-\\uA67D',
    '\\uA67E',
    '\\uA67F',
    '\\uA680-\\uA69B',
    '\\uA69C-\\uA69D',
    '\\uA69E-\\uA69F',
    '\\uFE2E-\\uFE2F',
    ']',
  ].join('')
);
// Character class matching any single code point in the ranges listed below
// (all within the Basic Multilingual Plane). isChunkSneaky uses it to detect
// Greek characters in a domain chunk.
const GREEK_PATTERN = new RegExp(
  [
    '[',
    '\\u0370-\\u0373',
    '\\u0375',
    '\\u0376-\\u0377',
    '\\u037A',
    '\\u037B-\\u037D',
    '\\u037F',
    '\\u0384',
    '\\u0386',
    '\\u0388-\\u038A',
    '\\u038C',
    '\\u038E-\\u03A1',
    '\\u03A3-\\u03E1',
    '\\u03F0-\\u03F5',
    '\\u03F6',
    '\\u03F7-\\u03FF',
    '\\u1D26-\\u1D2A',
    '\\u1D5D-\\u1D61',
    '\\u1D66-\\u1D6A',
    '\\u1DBF',
    '\\u1F00-\\u1F15',
    '\\u1F18-\\u1F1D',
    '\\u1F20-\\u1F45',
    '\\u1F48-\\u1F4D',
    '\\u1F50-\\u1F57',
    '\\u1F59',
    '\\u1F5B',
    '\\u1F5D',
    '\\u1F5F-\\u1F7D',
    '\\u1F80-\\u1FB4',
    '\\u1FB6-\\u1FBC',
    '\\u1FBD',
    '\\u1FBE',
    '\\u1FBF-\\u1FC1',
    '\\u1FC2-\\u1FC4',
    '\\u1FC6-\\u1FCC',
    '\\u1FCD-\\u1FCF',
    '\\u1FD0-\\u1FD3',
    '\\u1FD6-\\u1FDB',
    '\\u1FDD-\\u1FDF',
    '\\u1FE0-\\u1FEC',
    '\\u1FED-\\u1FEF',
    '\\u1FF2-\\u1FF4',
    '\\u1FF6-\\u1FFC',
    '\\u1FFD-\\u1FFE',
    '\\u2126',
    '\\uAB65',
    ']',
  ].join('')
);
// Character class for code points above U+FFFF, given as [first, last] pairs
// (or a single code point). The 'u' flag makes the regex treat each
// surrogate pair as one character so the ranges work as written.
const HIGH_GREEK_PATTERN = new RegExp(
  `[${[
    [0x10140, 0x10174],
    [0x10175, 0x10178],
    [0x10179, 0x10189],
    [0x1018a, 0x1018b],
    [0x1018c, 0x1018e],
    [0x101a0],
    [0x1d200, 0x1d241],
    [0x1d242, 0x1d244],
    [0x1d245],
  ]
    .map((bounds) =>
      bounds.map((codePoint) => String.fromCodePoint(codePoint)).join('-')
    )
    .join('')}]`,
  'u'
);
/**
 * Reports whether a chunk mixes Latin characters with Cyrillic or Greek ones.
 *
 * @param {string} chunk - Text to inspect (isLinkSneaky passes one
 *   dot-separated piece of a domain).
 * @returns {boolean} true when the chunk matches LATIN_PATTERN and also
 *   matches CYRILLIC_PATTERN, GREEK_PATTERN or HIGH_GREEK_PATTERN; false
 *   otherwise, including when it has no Latin characters at all.
 */
function isChunkSneaky(chunk) {
  // Without any Latin character there is nothing to mix with.
  if (!LATIN_PATTERN.test(chunk)) {
    return false;
  }

  // Checked in order; stops at the first script that matches.
  const otherScripts = [CYRILLIC_PATTERN, GREEK_PATTERN, HIGH_GREEK_PATTERN];
  return otherScripts.some((pattern) => pattern.test(chunk));
}
// Matches every ASCII character (U+0000-U+007F). The global flag lets
// String.prototype.replace strip all of them in one call.
// NOTE: a global regex keeps state in lastIndex between .test()/.exec()
// calls, so a successful .test() shifts where the next one starts searching.
// eslint-disable-next-line no-control-regex
const ASCII_PATTERN = /[\u0000-\u007F]/g;
function isLinkSneaky(link) {
const domain = getDomain(link);
@ -350,12 +223,14 @@ function isLinkSneaky(link) {
? nodeUrl.domainToUnicode(domain)
: domain;
const chunks = unicodeDomain.split('.');
for (let i = 0, max = chunks.length; i < max; i += 1) {
const chunk = chunks[i];
if (isChunkSneaky(chunk)) {
return true;
}
const withoutPeriods = unicodeDomain.replace(/\./g, '');
const hasASCII = ASCII_PATTERN.test(withoutPeriods);
const withoutASCII = withoutPeriods.replace(ASCII_PATTERN, '');
const isMixed = hasASCII && withoutASCII.length > 0;
if (isMixed) {
return true;
}
return false;

View File

@ -360,6 +360,12 @@ describe('Link previews', () => {
assert.strictEqual(actual, true);
});
it('returns true for ASCII and non-ASCII mix', () => {
const link = 'https://www.аррӏе.com';
const actual = isLinkSneaky(link);
assert.strictEqual(actual, true);
});
it('returns true for Latin + High Greek domain', () => {
const link = `https://www.apple${String.fromCodePoint(0x101a0)}.com`;
const actual = isLinkSneaky(link);