Skip to content

Commit

Permalink
fix(UABOT-89): fix url parsing with special symbols
Browse files Browse the repository at this point in the history
  • Loading branch information
DrSmile444 committed Dec 2, 2022
1 parent f86502f commit cd99958
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 16 deletions.
4 changes: 3 additions & 1 deletion src/services/constants/swindlers-urls.constant.ts
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,10 @@ export const EXCEPTION_DOMAINS = [
't.me/',
];

// Matches a single character outside [A-Za-z0-9_], so whitespace, punctuation
// and non-ASCII letters all count as "non-word" here.
export const NON_WORD_REGEX = /\W/;

// Global pattern for URL-like tokens: with or without an http(s) scheme and/or
// a `www.` prefix, followed by a dot and at least two non-whitespace characters.
// NOTE(review): two pattern literals follow the `=` below. They differ only in
// the trailing `\W?` (one optional non-word character after the URL) and look
// like the before/after lines of a diff - confirm that only one of them is
// present in the real file.
export const URL_REGEXP =
/(https?:\/\/(?:www\.|(?!www))?[\dA-Za-z][\dA-Za-z-]+[\dA-Za-z]\.\S{2,}|www\.[\dA-Za-z][\dA-Za-z-]+[\dA-Za-z]\.\S{2,}|(https?:\/\/(?:www\.|(?!www)))?[\dA-Za-z-]+\.\S{2,}|www\.?[\dA-Za-z]+\.\S{2,})/g;
/(https?:\/\/(?:www\.|(?!www))?[\dA-Za-z][\dA-Za-z-]+[\dA-Za-z]\.\S{2,}|www\.[\dA-Za-z][\dA-Za-z-]+[\dA-Za-z]\.\S{2,}|(https?:\/\/(?:www\.|(?!www)))?[\dA-Za-z-]+\.\S{2,}|www\.?[\dA-Za-z]+\.\S{2,})\W?/g;
export const VALID_URL_REGEXP = new RegExp(
'^(https?:\\/\\/)?' + // protocol
'((([a-z\\d]([a-z\\d-]*[a-z\\d])*)\\.)+[a-z]{2,}|' + // domain name
Expand Down
2 changes: 1 addition & 1 deletion src/services/swindlers-detect.service.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ describe('SwindlersDetectService', () => {

it('should match swindler unresolved short url as spam', async () => {
const text = 'https://privat24.io/ тест';
const responseUrl = 'https://privat24.io/';
const responseUrl = 'https://privat24.io';
// eslint-disable-next-line prefer-promise-reject-errors
axiosMock.get.mockImplementationOnce(() => Promise.reject({ response: { headers: { location: responseUrl } } }));
const result = await swindlersDetectService.isSwindlerMessage(text);
Expand Down
2 changes: 1 addition & 1 deletion src/services/swindlers-urls.service.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ describe('SwindlersUrlsService', () => {
axiosMock.get.mockImplementationOnce(() => Promise.resolve({ request: { res: { responseUrl: parsedUrl } } }));
const isUrlSpam = await swindlersUrlsService.isSpamUrl(parsedUrl);

expect(parsedUrl).toEqual('https://da-pay.me/');
expect(parsedUrl).toEqual('https://da-pay.me');
expect(isUrlSpam.isSpam).toEqual(true);
expect(result?.isSpam).toEqual(true);
expect(result?.rate).toEqual(200);
Expand Down
27 changes: 25 additions & 2 deletions src/services/url.service.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@ describe('UrlService', () => {

console.info(text);

expect(result).toEqual(['https://url.com/', 'url.com']);
expect(result).toEqual(['https://url.com']);
});

it('should parse urls without special symbols at the end', () => {
  // Every URL in the message is directly followed by punctuation ("," or ".").
  const message = `test https://url.com/, test url.com. http://24.site/?order=946,`;
  const parsedUrls = urlService.parseUrls(message);

  console.info(message);

  // The punctuation must not appear in the output, and the two spellings of
  // url.com are expected to collapse into a single entry.
  const expectedUrls = ['https://url.com', 'http://24.site/?order=946'];
  expect(parsedUrls).toEqual(expectedUrls);
});

it('should not parse invalid urls', () => {
Expand Down Expand Up @@ -51,14 +60,28 @@ describe('UrlService', () => {

expect(result).toEqual('www.orpay.me/');
});

it('should parse domain from string with /', () => {
  // The input already ends with a slash; the expected domain keeps it.
  const domain = urlService.getUrlDomain('https://www.orpay.me/');

  expect(domain).toEqual('www.orpay.me/');
});

it('should parse domain from string without /', () => {
  // No trailing slash in the input; the expected domain still ends with one.
  const domain = urlService.getUrlDomain('https://www.orpay.me');

  expect(domain).toEqual('www.orpay.me/');
});
});

describe('processMessage', () => {
it('should process messages', () => {
// A URL followed by a space and non-URL text; only the first parsed URL is checked.
const text = `https://da-pay.me/ тест`;
const parsedUrl = urlService.parseUrls(text)[0];

// NOTE(review): the two expectations below contradict each other (with and
// without the trailing slash). They look like the before/after lines of a
// diff - confirm which one actually belongs in the test file.
expect(parsedUrl).toEqual('https://da-pay.me/');
expect(parsedUrl).toEqual('https://da-pay.me');
});

it('should not process telegram message', () => {
Expand Down
33 changes: 22 additions & 11 deletions src/services/url.service.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { EXCEPTION_DOMAINS, URL_REGEXP, VALID_URL_REGEXP } from './constants';
import { removeDuplicates } from '../utils';

import { EXCEPTION_DOMAINS, NON_WORD_REGEX, URL_REGEXP, VALID_URL_REGEXP } from './constants';

export class UrlService {
/**
Expand All @@ -8,16 +10,25 @@ export class UrlService {
* @returns {string[]} - parsed urls
*/
parseUrls(message: string, strict = false): string[] {
  // Extracts every URL-like token from `message`, normalizes each one
  // (scheme added when missing, one trailing symbol and a trailing "/" removed),
  // keeps only the URLs that parse and pass VALID_URL_REGEXP, and returns them
  // without duplicates. When `strict` is true the EXCEPTION_DOMAINS filter is skipped.
  const candidates = message.match(URL_REGEXP) ?? [];

  const normalizedUrls = candidates.map((rawUrl) => {
    // A match may carry surrounding whitespace; remove it first.
    const clearUrl = rawUrl.trim();

    // Drop one trailing non-word character such as "," or "." that was glued to
    // the URL in the text. Only the last character is inspected.
    const noSpecialSymbolUrl = NON_WORD_REGEX.test(clearUrl.slice(-1)) ? clearUrl.slice(0, -1) : clearUrl;

    // Check the complete scheme prefix. Comparing only the first four characters
    // with "http" misclassifies hosts such as "httpbin.org" as already having a
    // scheme, which makes `new URL` throw below and silently drops the URL.
    const hasProtocol = noSpecialSymbolUrl.startsWith('http://') || noSpecialSymbolUrl.startsWith('https://');
    const validUrl = hasProtocol ? noSpecialSymbolUrl : `https://${noSpecialSymbolUrl}`;

    // Normalize away a trailing slash so "https://url.com/" and "https://url.com"
    // become the same string and can be de-duplicated.
    return validUrl.endsWith('/') ? validUrl.slice(0, -1) : validUrl;
  });

  const acceptedUrls = normalizedUrls.filter((url) => {
    try {
      // `new URL` throws on input it cannot parse; such candidates are rejected.
      const { host } = new URL(url);
      const isNotExcluded = strict || !EXCEPTION_DOMAINS.includes(host);

      return isNotExcluded && VALID_URL_REGEXP.test(url);
    } catch {
      return false;
    }
  });

  return removeDuplicates(acceptedUrls);
}

/**
Expand Down

0 comments on commit cd99958

Please sign in to comment.