From cd99958a02c3e9907f64160ecab3c5d8f9264fe4 Mon Sep 17 00:00:00 2001 From: Dmytro Vakulenko Date: Fri, 2 Dec 2022 13:13:37 +0200 Subject: [PATCH] fix(UABOT-89): fix url parsing with special symbols --- .../constants/swindlers-urls.constant.ts | 4 ++- src/services/swindlers-detect.service.test.ts | 2 +- src/services/swindlers-urls.service.test.ts | 2 +- src/services/url.service.test.ts | 27 +++++++++++++-- src/services/url.service.ts | 33 ++++++++++++------- 5 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/services/constants/swindlers-urls.constant.ts b/src/services/constants/swindlers-urls.constant.ts index 1549abc3..4d127ed7 100644 --- a/src/services/constants/swindlers-urls.constant.ts +++ b/src/services/constants/swindlers-urls.constant.ts @@ -444,8 +444,10 @@ export const EXCEPTION_DOMAINS = [ 't.me/', ]; +export const NON_WORD_REGEX = /\W/; + export const URL_REGEXP = - /(https?:\/\/(?:www\.|(?!www))?[\dA-Za-z][\dA-Za-z-]+[\dA-Za-z]\.\S{2,}|www\.[\dA-Za-z][\dA-Za-z-]+[\dA-Za-z]\.\S{2,}|(https?:\/\/(?:www\.|(?!www)))?[\dA-Za-z-]+\.\S{2,}|www\.?[\dA-Za-z]+\.\S{2,})/g; + /(https?:\/\/(?:www\.|(?!www))?[\dA-Za-z][\dA-Za-z-]+[\dA-Za-z]\.\S{2,}|www\.[\dA-Za-z][\dA-Za-z-]+[\dA-Za-z]\.\S{2,}|(https?:\/\/(?:www\.|(?!www)))?[\dA-Za-z-]+\.\S{2,}|www\.?[\dA-Za-z]+\.\S{2,})\W?/g; export const VALID_URL_REGEXP = new RegExp( '^(https?:\\/\\/)?' + // protocol '((([a-z\\d]([a-z\\d-]*[a-z\\d])*)\\.)+[a-z]{2,}|' + // domain name diff --git a/src/services/swindlers-detect.service.test.ts b/src/services/swindlers-detect.service.test.ts index 361b93f5..6d0082ea 100644 --- a/src/services/swindlers-detect.service.test.ts +++ b/src/services/swindlers-detect.service.test.ts @@ -53,7 +53,7 @@ describe('SwindlersDetectService', () => { it('should match swindler unresolved short url as spam', async () => { const text = 'https://privat24.io/ тест'; - const responseUrl = 'https://privat24.io/'; + const responseUrl = 'https://privat24.io'; // eslint-disable-next-line prefer-promise-reject-errors axiosMock.get.mockImplementationOnce(() => Promise.reject({ response: { headers: { location: responseUrl } } })); const result = await swindlersDetectService.isSwindlerMessage(text); diff --git a/src/services/swindlers-urls.service.test.ts b/src/services/swindlers-urls.service.test.ts index 8beb5fec..82a0bd51 100644 --- a/src/services/swindlers-urls.service.test.ts +++ b/src/services/swindlers-urls.service.test.ts @@ -105,7 +105,7 @@ describe('SwindlersUrlsService', () => { axiosMock.get.mockImplementationOnce(() => Promise.resolve({ request: { res: { responseUrl: parsedUrl } } })); const isUrlSpam = await swindlersUrlsService.isSpamUrl(parsedUrl); - expect(parsedUrl).toEqual('https://da-pay.me/'); + expect(parsedUrl).toEqual('https://da-pay.me'); expect(isUrlSpam.isSpam).toEqual(true); expect(result?.isSpam).toEqual(true); expect(result?.rate).toEqual(200); diff --git a/src/services/url.service.test.ts b/src/services/url.service.test.ts index 4a426f29..23e4f58e 100644 --- a/src/services/url.service.test.ts +++ b/src/services/url.service.test.ts @@ -9,7 +9,16 @@ describe('UrlService', () => { console.info(text); - expect(result).toEqual(['https://url.com/', 'url.com']); + expect(result).toEqual(['https://url.com']); + }); + + it('should parse urls without special symbols at the end', () => { + const text = `test https://url.com/, test url.com. http://24.site/?order=946,`; + const result = urlService.parseUrls(text); + + console.info(text); + + expect(result).toEqual(['https://url.com', 'http://24.site/?order=946']); }); it('should not parse invalid urls', () => { @@ -51,6 +60,20 @@ describe('UrlService', () => { expect(result).toEqual('www.orpay.me/'); }); + + it('should parse domain from string with /', () => { + const text = 'https://www.orpay.me/'; + const result = urlService.getUrlDomain(text); + + expect(result).toEqual('www.orpay.me/'); + }); + + it('should parse domain from string without /', () => { + const text = 'https://www.orpay.me'; + const result = urlService.getUrlDomain(text); + + expect(result).toEqual('www.orpay.me/'); + }); }); describe('processMessage', () => { @@ -58,7 +81,7 @@ describe('UrlService', () => { const text = `https://da-pay.me/ тест`; const parsedUrl = urlService.parseUrls(text)[0]; - expect(parsedUrl).toEqual('https://da-pay.me/'); + expect(parsedUrl).toEqual('https://da-pay.me'); }); it('should not process telegram message', () => { diff --git a/src/services/url.service.ts b/src/services/url.service.ts index 0c1e08b1..b894b936 100644 --- a/src/services/url.service.ts +++ b/src/services/url.service.ts @@ -1,4 +1,6 @@ -import { EXCEPTION_DOMAINS, URL_REGEXP, VALID_URL_REGEXP } from './constants'; +import { removeDuplicates } from '../utils'; + +import { EXCEPTION_DOMAINS, NON_WORD_REGEX, URL_REGEXP, VALID_URL_REGEXP } from './constants'; export class UrlService { /** @@ -8,16 +10,25 @@ export class UrlService { * @returns {string[]} - parsed urls */ parseUrls(message: string, strict = false): string[] { - return (message.match(URL_REGEXP) || []).filter((url) => { - const validUrl = url.slice(0, 4) === 'http' ? url : `https://${url}`; - try { - const urlInstance = new URL(validUrl); - const isNotExcluded = strict ? true : !EXCEPTION_DOMAINS.includes(urlInstance.host); - return urlInstance && isNotExcluded && VALID_URL_REGEXP.test(validUrl); - } catch { - return false; - } - }); + return removeDuplicates( + (message.match(URL_REGEXP) || []) + .map((url) => { + const clearUrl = url.trim(); + const noSpecialSymbolUrl = NON_WORD_REGEX.test(clearUrl.slice(-1)) ? clearUrl.slice(0, -1) : clearUrl; + const validUrl = noSpecialSymbolUrl.slice(0, 4) === 'http' ? noSpecialSymbolUrl : `https://${noSpecialSymbolUrl}`; + + return validUrl.slice(-1) === '/' ? validUrl.slice(0, -1) : validUrl; + }) + .filter((url) => { + try { + const urlInstance = new URL(url); + const isNotExcluded = strict ? true : !EXCEPTION_DOMAINS.includes(urlInstance.host); + return urlInstance && isNotExcluded && VALID_URL_REGEXP.test(url); + } catch { + return false; + } + }), + ); } /**