diff --git a/index.js b/index.js index 53044c3..c375067 100644 --- a/index.js +++ b/index.js @@ -5,6 +5,7 @@ import httpClientGen from './utils/http-client.js' import dnsLookupGen from './utils/dns-lookup.js' import logger from './utils/logger.js' import CacheableLookup from 'cacheable-lookup' +import { gotSsrf } from 'got-ssrf' const debug = logger('index.js') @@ -20,14 +21,26 @@ export default ( timeout: { request: 14000 // global timeout }, - cache: new QuickLRU({ maxSize: 1000 }), - dnsCache: new CacheableLookup({ cache: new QuickLRU({ maxSize: 10000 }) }) + cache: new QuickLRU({ maxSize: 10000 }), + dnsCache: new CacheableLookup({ cache: new QuickLRU({ maxSize: 100000 }) }) }, - timeoutMs = 15000 // global timeout for the ENTIRE function, because I'm afraid of blocking the event loop w/ some of the more compute-intensive shit + timeoutMs = 15000, + canonicizeMemOpts = { + cache: new QuickLRU({ maxSize: 100000 }), + cachePromiseRejection: true, + maxAge: 86400000 // 24 hours + }, + stripTrackersMemOpts = { cache: new QuickLRU({ maxSize: 100000 }) } + // The cache numbers are pulled from the most reliable source on the internet: my ass. ) => { - const httpClient = httpClientGen(gotOptions) const dnsLookup = dnsLookupGen(gotOptions) - const normalize = normalizeUrl(normalizeUrlOptions, dnsLookup, httpClient) + const normalize = normalizeUrl( + normalizeUrlOptions, + dnsLookup, + gotSsrf.extend(gotOptions), // don't really need to mimic browser behaviour or canonicize shit + stripTrackersMemOpts + ) + const httpClient = httpClientGen(normalize, gotOptions, canonicizeMemOpts) // Normalize URL so that we can search by URL. async function normalizePlus(url = '') { @@ -39,12 +52,13 @@ export default ( debug('Normalization first pass: %s', url) // 2. Follow redirects to deal with "intermediate" links (such as the links on google search results) - const res = await httpClient.get(link, { context: { normalize } }) + const res = await httpClient.get(link) debug('Normalization second pass: %s', res.url) // At this point, the link will be completely normalized based on canonical links (if one exists) return res.url } + // global timeout for the ENTIRE function, because I'm afraid of blocking the event loop w/ some of the more compute-intensive shit return url => pTimeout(normalizePlus(url), timeoutMs) } diff --git a/package-lock.json b/package-lock.json index 69229eb..c783266 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,11 +13,13 @@ "debug": "^4.3.2", "got": "^12.0.0-beta.4", "got-scraping": "^3.0.1", - "got-ssrf": "^1.0.2", + "got-ssrf": "^1.1.0", "ipaddr.js": "^2.0.1", "leven": "^4.0.0", "lodash": "^4.17.21", + "mem": "^9.0.1", "normalize-url": "^7.0.1", + "p-memoize": "^4.0.1", "p-timeout": "^5.0.0", "quick-lru": "^6.0.1", "tld-extract": "^2.0.1" @@ -3430,10 +3432,11 @@ } }, "node_modules/got-ssrf": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/got-ssrf/-/got-ssrf-1.0.2.tgz", - "integrity": "sha512-9AkQhk97CXekxF1VLugiVBBHDQK96r/AgH0Z+iwgOW/fH97AblOhroLzblKnUwSkvQBFMqNacKCmUfsbc+Xq9w==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/got-ssrf/-/got-ssrf-1.1.0.tgz", + "integrity": "sha512-XsoB8S+0FqrnFGLu8M4HMZa3T+spFpVPQ0A/MDIl2U6KomrYbYUTWou3lGa7fvniqcP6R9dObkFZeAA5so8fNA==", "dependencies": { + "debug": "^4.3.2", "got": "^12.0.0-beta.4", "ipaddr.js": "^2.0.1" }, @@ -5386,6 +5389,43 @@ "tmpl": "1.0.x" } }, + "node_modules/map-age-cleaner": { + "version": "0.1.3", + "resolved": 
"https://registry.npmjs.org/map-age-cleaner/-/map-age-cleaner-0.1.3.tgz", + "integrity": "sha512-bJzx6nMoP6PDLPBFmg7+xRKeFZvFboMrGlxmNj9ClvX53KrmvM5bXFXEWjbz4cz1AFn+jWJ9z/DJSz7hrs0w3w==", + "dependencies": { + "p-defer": "^1.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/mem": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/mem/-/mem-9.0.1.tgz", + "integrity": "sha512-f4uEX3Ley9FZqcFIRSBr2q43x1bJQeDvsxgkSN/BPnA7jY9Aue4sBU2dsjmpDwiaY/QY1maNCeosbUHQWzzdQw==", + "dependencies": { + "map-age-cleaner": "^0.1.3", + "mimic-fn": "^4.0.0" + }, + "engines": { + "node": ">=12.20" + }, + "funding": { + "url": "https://github.com/sindresorhus/mem?sponsor=1" + } + }, + "node_modules/mem/node_modules/mimic-fn": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-4.0.0.tgz", + "integrity": "sha512-vqiC06CuhBTUdZH+RYl8sFrL096vA45Ok5ISO6sE/Mr1jRbGH4Csnhi8f3wKVl7x8mO4Au7Ir9D3Oyv1VYMFJw==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/memorystream": { "version": "0.3.1", "resolved": "https://registry.npmjs.org/memorystream/-/memorystream-0.3.1.tgz", @@ -5952,6 +5992,14 @@ "node": ">=8" } }, + "node_modules/p-defer": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-defer/-/p-defer-1.0.0.tgz", + "integrity": "sha1-n26xgvbJqozXQwBKfU+WsZaw+ww=", + "engines": { + "node": ">=4" + } + }, "node_modules/p-each-series": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/p-each-series/-/p-each-series-2.2.0.tgz", @@ -6003,6 +6051,44 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/p-memoize": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/p-memoize/-/p-memoize-4.0.1.tgz", + "integrity": "sha512-km0sP12uE0dOZ5qP+s7kGVf07QngxyG0gS8sYFvFWhqlgzOsSy+m71aUejf/0akxj5W7gE//2G74qTv6b4iMog==", + "dependencies": { + "mem": "^6.0.1", + "mimic-fn": "^3.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sindresorhus/p-memoize?sponsor=1" + } + }, + "node_modules/p-memoize/node_modules/mem": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/mem/-/mem-6.1.1.tgz", + "integrity": "sha512-Ci6bIfq/UgcxPTYa8dQQ5FY3BzKkT894bwXWXxC/zqs0XgMO2cT20CGkOqda7gZNkmK5VP4x89IGZ6K7hfbn3Q==", + "dependencies": { + "map-age-cleaner": "^0.1.3", + "mimic-fn": "^3.0.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sindresorhus/mem?sponsor=1" + } + }, + "node_modules/p-memoize/node_modules/mimic-fn": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-3.1.0.tgz", + "integrity": "sha512-Ysbi9uYW9hFyfrThdDEQuykN4Ey6BuwPD2kpI5ES/nFTDn/98yxYNLZJcgUAKPT/mcrLLKaGzJR9YVxJrIdASQ==", + "engines": { + "node": ">=8" + } + }, "node_modules/p-timeout": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.0.0.tgz", @@ -10107,10 +10193,11 @@ } }, "got-ssrf": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/got-ssrf/-/got-ssrf-1.0.2.tgz", - "integrity": "sha512-9AkQhk97CXekxF1VLugiVBBHDQK96r/AgH0Z+iwgOW/fH97AblOhroLzblKnUwSkvQBFMqNacKCmUfsbc+Xq9w==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/got-ssrf/-/got-ssrf-1.1.0.tgz", + "integrity": "sha512-XsoB8S+0FqrnFGLu8M4HMZa3T+spFpVPQ0A/MDIl2U6KomrYbYUTWou3lGa7fvniqcP6R9dObkFZeAA5so8fNA==", "requires": { + "debug": "^4.3.2", "got": "^12.0.0-beta.4", "ipaddr.js": "^2.0.1" } @@ -11570,6 +11657,30 @@ "tmpl": 
"1.0.x" } }, + "map-age-cleaner": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/map-age-cleaner/-/map-age-cleaner-0.1.3.tgz", + "integrity": "sha512-bJzx6nMoP6PDLPBFmg7+xRKeFZvFboMrGlxmNj9ClvX53KrmvM5bXFXEWjbz4cz1AFn+jWJ9z/DJSz7hrs0w3w==", + "requires": { + "p-defer": "^1.0.0" + } + }, + "mem": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/mem/-/mem-9.0.1.tgz", + "integrity": "sha512-f4uEX3Ley9FZqcFIRSBr2q43x1bJQeDvsxgkSN/BPnA7jY9Aue4sBU2dsjmpDwiaY/QY1maNCeosbUHQWzzdQw==", + "requires": { + "map-age-cleaner": "^0.1.3", + "mimic-fn": "^4.0.0" + }, + "dependencies": { + "mimic-fn": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-4.0.0.tgz", + "integrity": "sha512-vqiC06CuhBTUdZH+RYl8sFrL096vA45Ok5ISO6sE/Mr1jRbGH4Csnhi8f3wKVl7x8mO4Au7Ir9D3Oyv1VYMFJw==" + } + } + }, "memorystream": { "version": "0.3.1", "resolved": "https://registry.npmjs.org/memorystream/-/memorystream-0.3.1.tgz", @@ -11993,6 +12104,11 @@ "resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz", "integrity": "sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==" }, + "p-defer": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-defer/-/p-defer-1.0.0.tgz", + "integrity": "sha1-n26xgvbJqozXQwBKfU+WsZaw+ww=" + }, "p-each-series": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/p-each-series/-/p-each-series-2.2.0.tgz", @@ -12026,6 +12142,31 @@ "aggregate-error": "^3.0.0" } }, + "p-memoize": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/p-memoize/-/p-memoize-4.0.1.tgz", + "integrity": "sha512-km0sP12uE0dOZ5qP+s7kGVf07QngxyG0gS8sYFvFWhqlgzOsSy+m71aUejf/0akxj5W7gE//2G74qTv6b4iMog==", + "requires": { + "mem": "^6.0.1", + "mimic-fn": "^3.0.0" + }, + "dependencies": { + "mem": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/mem/-/mem-6.1.1.tgz", + "integrity": "sha512-Ci6bIfq/UgcxPTYa8dQQ5FY3BzKkT894bwXWXxC/zqs0XgMO2cT20CGkOqda7gZNkmK5VP4x89IGZ6K7hfbn3Q==", + "requires": { + "map-age-cleaner": "^0.1.3", + "mimic-fn": "^3.0.0" + } + }, + "mimic-fn": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-3.1.0.tgz", + "integrity": "sha512-Ysbi9uYW9hFyfrThdDEQuykN4Ey6BuwPD2kpI5ES/nFTDn/98yxYNLZJcgUAKPT/mcrLLKaGzJR9YVxJrIdASQ==" + } + } + }, "p-timeout": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.0.0.tgz", diff --git a/package.json b/package.json index 8384247..c6945c6 100644 --- a/package.json +++ b/package.json @@ -20,11 +20,13 @@ "debug": "^4.3.2", "got": "^12.0.0-beta.4", "got-scraping": "^3.0.1", - "got-ssrf": "^1.0.2", + "got-ssrf": "^1.1.0", "ipaddr.js": "^2.0.1", "leven": "^4.0.0", "lodash": "^4.17.21", + "mem": "^9.0.1", "normalize-url": "^7.0.1", + "p-memoize": "^4.0.1", "p-timeout": "^5.0.0", "quick-lru": "^6.0.1", "tld-extract": "^2.0.1" diff --git a/utils/__fixtures__/empty-cache.js b/utils/__fixtures__/empty-cache.js new file mode 100644 index 0000000..666808e --- /dev/null +++ b/utils/__fixtures__/empty-cache.js @@ -0,0 +1,21 @@ +export default class EmptyCache { + get() { + // noop + } + + set() { + // noop + } + + has() { + return false + } + + delete() { + // noop + } + + clear() { + // noop + } +} diff --git a/utils/__mocks__/strip-trackers.js b/utils/__mocks__/strip-trackers.js index 3b6bb6e..1465779 100644 --- a/utils/__mocks__/strip-trackers.js +++ b/utils/__mocks__/strip-trackers.js @@ -1,3 +1,5 @@ -export default function (url) { - 
return url // much function, very wow +export default function gen() { + return function clearUrl(url) { + return url // much function, very wow + } } diff --git a/utils/canonicize.js b/utils/canonicize.js index 036cdda..1eeb117 100644 --- a/utils/canonicize.js +++ b/utils/canonicize.js @@ -2,136 +2,145 @@ import cheerio from 'cheerio' import trim from 'lodash/trim.js' import leven from 'leven' import parseTld from 'tld-extract' +import mem from 'p-memoize' import urlIsAmp from './url-is-amp.js' import logger from './logger.js' const debug = logger('utils/canonicize.js') -// Look for the canonical link (also un-AMP-ifies the canonical link) -// Not writing a separate metascraper-canonical library for this, as the "standard" way of determining -// canonical link includes looking at the HTTP header: https://developers.google.com/search/docs/advanced/crawling/consolidate-duplicate-urls -export default async function canonicizeHook(res) { - if (!res.request.options.context.normalize) { - debug('got.context.normalize not set; not canonicizing...') - return res - } - - const { normalize } = res.request.options.context - - // Normalize the "final" URL up front - const normalizedUrl = await normalize(res.url) - debug('Normalized res.url %s to %s', res.url, normalizedUrl) - - // Ripped from https://github.com/KilledMufasa/AmputatorBot/blob/master/helpers/canonical_methods.py - const $ = cheerio.load(res.body) - const matches = [] - - // 5.1: rel=canonical tag - $('link[rel=canonical]').each(function () { - const match = $(this).attr('href') - matches.push(match) - debug('Matched rel=canonical tag: %s', match) - }) - - // 5.2: rel=canonical HTTP header - if ('link' in res.headers) { - debug('"Link" header exists, searching for rel=canonical...') - - // We're looking for something like: - // Link: ; rel="canonical", ... - res.headers.link.split(',').forEach(linkHeader => { - const parts = linkHeader.split(';') - if (parts.length !== 2) { - debug('Not enough parts exist in the header: %s', linkHeader) - return - } +export default function canonicizeGen(normalize, memOpts) { + // Look for the canonical link (also un-AMP-ifies the canonical link) + // Not writing a separate metascraper-canonical library for this, as the "standard" way of determining + // canonical link includes looking at the HTTP header: https://developers.google.com/search/docs/advanced/crawling/consolidate-duplicate-urls + async function getCanonical(res, normalizedUrl) { + // Ripped from https://github.com/KilledMufasa/AmputatorBot/blob/master/helpers/canonical_methods.py + const $ = cheerio.load(res.body) + const matches = [] + + // 5.1: rel=canonical tag + $('link[rel=canonical]').each(function () { + const match = $(this).attr('href') + matches.push(match) + debug('Matched rel=canonical tag: %s', match) + }) - const [linkStr, relStr] = parts - debug('Extracted link fragment %s and rel fragment %s', linkStr, relStr) + // 5.2: rel=canonical HTTP header + if ('link' in res.headers) { + debug('"Link" header exists, searching for rel=canonical...') + + // We're looking for something like: + // Link: ; rel="canonical", ... + res.headers.link.split(',').forEach(linkHeader => { + const parts = linkHeader.split(';') + if (parts.length !== 2) { + debug('Not enough parts exist in the header: %s', linkHeader) + return + } + + const [linkStr, relStr] = parts + debug('Extracted link fragment %s and rel fragment %s', linkStr, relStr) + + // rel="canonical", rel=canonical, rel canonical, etc. 
+ const relStrLower = relStr.toLowerCase() + if (relStrLower.includes('rel') && relStrLower.includes('canonical')) { + // , https://example.com, etc. + const url = trim(linkStr.trim(), ['<', '>', ' ']) + matches.push(url) + debug('Found canonical in header: %s', url) + } + }) + } - // rel="canonical", rel=canonical, rel canonical, etc. - const relStrLower = relStr.toLowerCase() - if (relStrLower.includes('rel') && relStrLower.includes('canonical')) { - // , https://example.com, etc. - const url = trim(linkStr.trim(), ['<', '>', ' ']) - matches.push(url) - debug('Found canonical in header: %s', url) - } + // 5.3: AMP variant + $('a.amp-canurl').each(function () { + const match = $(this).attr('href') + matches.push(match) + debug('Found non-AMP variant: %s', match) }) - } - - // 5.3: AMP variant - $('a.amp-canurl').each(function () { - const match = $(this).attr('href') - matches.push(match) - debug('Found non-AMP variant: %s', match) - }) - // 5.4: OpenGraph - $('meta[property="og:url"]').each(function () { - const match = $(this).attr('content') - matches.push(match) - debug('Found OpenGraph og:url: %s', match) - }) + // 5.4: OpenGraph + $('meta[property="og:url"]').each(function () { + const match = $(this).attr('content') + matches.push(match) + debug('Found OpenGraph og:url: %s', match) + }) - // 5.5: Sitemap (I'm not doing this shit) - - // The only reason we want canonical is to make our job with normalization easier; - // So we need to make sure the canonical link IS for the url we're trying to normalize! - - const { hostname: domain } = new URL(normalizedUrl) - const { domain: baseDomain } = parseTld(normalizedUrl) - debug('Finding the best match for host %s and TLD %s...', domain, baseDomain) - - let result = normalizedUrl - let minDist = Number.POSITIVE_INFINITY - - for (const match of matches) { - let link = match - - // turn relative to absolute URL - if (match.startsWith('/')) link = `${domain}${match}` - debug('Considering match %s...', link) - - // Skip invalid links - try { - link = await normalize(link) - debug('Normalized match to %s', link) - - // Ensure that every match is a valid URL w/ a matching domain - // In this case, we're only matching the "top-level" domain - - // e.g. subdomain.(domain.com) - as a lot of sites host their shit on amp.(site.com) - // so we want to include references to www.site.com (actually *prefer* those) - const { domain: matchDomain } = parseTld(link) - if (matchDomain !== baseDomain) { - debug( - 'The domain %s does not match the base domain %s', - matchDomain, - baseDomain - ) + // 5.5: Sitemap (I'm not doing this shit) + + // The only reason we want canonical is to make our job with normalization easier; + // So we need to make sure the canonical link IS for the url we're trying to normalize! + + const { hostname: domain } = new URL(normalizedUrl) + const { domain: baseDomain } = parseTld(normalizedUrl) + debug( + 'Finding the best match for host %s and TLD %s...', + domain, + baseDomain + ) + + let result = normalizedUrl + let minDist = Number.POSITIVE_INFINITY + + for (const match of matches) { + let link = match + + // turn relative to absolute URL + if (match.startsWith('/')) link = `${domain}${match}` + debug('Considering match %s...', link) + + // Skip invalid links + try { + link = await normalize(link) + debug('Normalized match to %s', link) + + // Ensure that every match is a valid URL w/ a matching domain + // In this case, we're only matching the "top-level" domain - + // e.g. 
subdomain.(domain.com) - as a lot of sites host their shit on amp.(site.com) + // so we want to include references to www.site.com (actually *prefer* those) + const { domain: matchDomain } = parseTld(link) + if (matchDomain !== baseDomain) { + debug( + 'The domain %s does not match the base domain %s', + matchDomain, + baseDomain + ) + continue + } + + // Then, ensure that links aren't AMP'd + if (urlIsAmp(link)) { + debug('Link %s is AMP, skipping...', link) + continue + } + } catch (err) { + debug('Error %s while considering match %s', err, match) continue } - // Then, ensure that links aren't AMP'd - if (urlIsAmp(link)) { - debug('Link %s is AMP, skipping...', link) - continue + // Then, sort by similarity to the normalized URL of the page we ended up in + const dist = leven(normalizedUrl, link) + if (dist < minDist) { + minDist = dist + result = link } - } catch (err) { - debug('Error %s while considering match %s', err, match) - continue } - // Then, sort by similarity to the normalized URL of the page we ended up in - const dist = leven(normalizedUrl, link) - if (dist < minDist) { - minDist = dist - result = link - } + debug('Found best match %s', result) + return result } - res.url = result - debug('Found best match %s', result) + const memCanonical = mem(getCanonical, { + ...memOpts, + cacheKey: args => args[1] // we want to cache by the normalized url in order to raise the hit rate + }) + + return async function canonicizeHook(res) { + // Normalize the "final" URL up front + const normalizedUrl = await normalize(res.url) + debug('Normalized res.url %s to %s', res.url, normalizedUrl) - return res + res.url = await memCanonical(res, normalizedUrl) + + return res + } } diff --git a/utils/canonicize.test.js b/utils/canonicize.test.js index 9dbe9c8..64efcd0 100644 --- a/utils/canonicize.test.js +++ b/utils/canonicize.test.js @@ -1,18 +1,26 @@ import { expect, describe, it } from '@jest/globals' import got from 'got' import nock from 'nock' -import hook from './canonicize' +import hookGen from './canonicize' +import EmptyCache from './__fixtures__/empty-cache' nock.disableNetConnect() +async function mockNormalize(url) { + // just append protocol, do nothing else + return url.startsWith('http') || url.startsWith('https') + ? url + : `http://${url}` +} + describe('extracting canonical links', () => { const httpClient = got.extend({ - hooks: { afterResponse: [hook] }, - context: { - normalize: async url => - url.startsWith('http') || url.startsWith('https') - ? 
url - : `http://${url}` + hooks: { + afterResponse: [ + hookGen(mockNormalize, { + cache: new EmptyCache() + }) + ] } }) diff --git a/utils/http-client.js b/utils/http-client.js index 384dad9..4e9f900 100644 --- a/utils/http-client.js +++ b/utils/http-client.js @@ -1,16 +1,16 @@ import got from 'got' import { gotScraping } from 'got-scraping' import { gotSsrf } from 'got-ssrf' -import canonicizeHook from './canonicize.js' +import canonicizeHookGen from './canonicize.js' -export default function httpClient(gotOptions) { +export default function httpClient(normalize, gotOptions, canonicizeMemOpts) { return got .extend(gotOptions) .extend(gotSsrf) .extend(gotScraping) .extend({ hooks: { - afterResponse: [canonicizeHook] + afterResponse: [canonicizeHookGen(normalize, canonicizeMemOpts)] } }) } diff --git a/utils/normalize-url.js b/utils/normalize-url.js index f506b0a..f7a5cde 100644 --- a/utils/normalize-url.js +++ b/utils/normalize-url.js @@ -1,11 +1,18 @@ import normalizeUrl from 'normalize-url' -import stripTrackers from './strip-trackers.js' +import stripTrackersGen from './strip-trackers.js' import { URL } from 'url' import logger from './logger.js' const debug = logger('utils/normalize-url.js') -export default function gen(normalizeUrlOptions, dnsLookup, httpClient) { +export default function gen( + normalizeUrlOptions, + dnsLookup, + httpClient, + memOpts +) { + const stripTrackers = stripTrackersGen(memOpts) + return async function normalize(originalUrl) { // We default to non-www, https links const preferredOptions = { diff --git a/utils/normalize-url.test.js b/utils/normalize-url.test.js index c6d6be7..3eb7da0 100644 --- a/utils/normalize-url.test.js +++ b/utils/normalize-url.test.js @@ -1,7 +1,6 @@ import { expect, describe, it } from '@jest/globals' import gen from './normalize-url' - -process.env.SKIP_CLEARURLS = 1 +import EmptyCache from './__fixtures__/empty-cache' const normalize = gen( { stripHash: true, removeQueryParameters: [] }, @@ -14,7 +13,8 @@ const normalize = gen( // throw if site doesn't support HTTPS if (url === 'https://www.test3.com/asdf') throw new Error() } - } + }, + { cache: new EmptyCache() } ) describe('link normalization', () => { diff --git a/utils/strip-trackers.js b/utils/strip-trackers.js index 944d588..9bbd78d 100644 --- a/utils/strip-trackers.js +++ b/utils/strip-trackers.js @@ -1,11 +1,12 @@ import { URL } from 'url' +import mem from 'mem' import load from '../data/loader.js' import logger from './logger.js' const providers = load() const debug = logger('utils/strip-trackers.js') -export default function clearUrl(url) { +function clearUrl(url) { debug('Stripping trackers for %s', url) // Clean the given URL with the provided rules data. @@ -85,3 +86,7 @@ export default function clearUrl(url) { return url } + +export default function clearUrlGen(memOpts) { + return mem(clearUrl, memOpts) +} diff --git a/utils/strip-trackers.test.js b/utils/strip-trackers.test.js index 5c9d513..bdb4e14 100644 --- a/utils/strip-trackers.test.js +++ b/utils/strip-trackers.test.js @@ -1,5 +1,8 @@ import { expect, describe, it } from '@jest/globals' -import clearUrl from './strip-trackers' +import clearUrlGen from './strip-trackers' +import EmptyCache from './__fixtures__/empty-cache' + +const clearUrl = clearUrlGen({ cache: new EmptyCache() }) describe('stripping trackers', () => { it('blocks "complete providers"', () => {
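
A note on the caching design in utils/canonicize.js above: the custom cacheKey is what makes the memoization effective, since the first argument (the got response object) is unique per request and would never repeat. Below is a minimal standalone sketch of the same pattern, assuming p-memoize@4 and quick-lru as pinned in package.json; expensiveCanonical is a hypothetical stand-in for the real getCanonical, not part of the patch:

import pMemoize from 'p-memoize'
import QuickLRU from 'quick-lru'

// Stand-in for getCanonical(res, normalizedUrl), whose expensive part
// is the cheerio parse of res.body.
async function expensiveCanonical(res, normalizedUrl) {
  return normalizedUrl // placeholder: the real function returns the best canonical match
}

const memCanonical = pMemoize(expensiveCanonical, {
  cache: new QuickLRU({ maxSize: 100000 }),
  cachePromiseRejection: true, // failed lookups stay cached instead of retrying on every call
  maxAge: 86400000, // 24 hours, mirroring canonicizeMemOpts in index.js
  cacheKey: args => args[1] // key on the normalized URL alone; keying on res would make every call a miss
})

With this, two responses that normalize to the same URL resolve from a single cached parse, which is the whole point of passing the normalized URL as a separate second argument.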
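
For context, here is a rough end-to-end usage sketch of the refactored factory. This is illustrative rather than a definitive API reference: the import path and example URL are made up, and all options fall back to the defaults shown in index.js above:

import normalizePlusGen from './index.js' // illustrative path; adjust to the package entry point

// Zero-config: the QuickLRU-backed got/DNS caches, the 24-hour canonicize
// memoization, and the 15-second overall p-timeout defaults all apply.
const normalizePlus = normalizePlusGen()

// Pass 1 (normalize): prefer https/non-www and strip trackers (memoized via mem).
// Pass 2 (httpClient.get): follow redirects; the memoized afterResponse hook
// rewrites res.url to the page's canonical link.
const url = await normalizePlus('http://example.com/story?utm_source=newsletter')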