diff --git a/index.js b/index.js
index 53044c3..c375067 100644
--- a/index.js
+++ b/index.js
@@ -5,6 +5,7 @@ import httpClientGen from './utils/http-client.js'
import dnsLookupGen from './utils/dns-lookup.js'
import logger from './utils/logger.js'
import CacheableLookup from 'cacheable-lookup'
+import { gotSsrf } from 'got-ssrf'
const debug = logger('index.js')
@@ -20,14 +21,26 @@ export default (
timeout: {
request: 14000 // global timeout
},
- cache: new QuickLRU({ maxSize: 1000 }),
- dnsCache: new CacheableLookup({ cache: new QuickLRU({ maxSize: 10000 }) })
+ cache: new QuickLRU({ maxSize: 10000 }),
+ dnsCache: new CacheableLookup({ cache: new QuickLRU({ maxSize: 100000 }) })
},
- timeoutMs = 15000 // global timeout for the ENTIRE function, because I'm afraid of blocking the event loop w/ some of the more compute-intensive shit
+ timeoutMs = 15000,
+ canonicizeMemOpts = {
+ cache: new QuickLRU({ maxSize: 100000 }),
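+ // note: rejected promises stay cached too, so a URL that fails canonicization isn't retried on every call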
+ cachePromiseRejection: true,
+ maxAge: 86400000 // 24 hours
+ },
+ stripTrackersMemOpts = { cache: new QuickLRU({ maxSize: 100000 }) }
+ // The cache numbers are pulled from the most reliable source on the internet: my ass.
) => {
- const httpClient = httpClientGen(gotOptions)
const dnsLookup = dnsLookupGen(gotOptions)
- const normalize = normalizeUrl(normalizeUrlOptions, dnsLookup, httpClient)
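+ // build normalize first with a bare SSRF-checked client; the full scraping client below needs normalize for its canonicize hook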
+ const normalize = normalizeUrl(
+ normalizeUrlOptions,
+ dnsLookup,
+ gotSsrf.extend(gotOptions), // don't really need to mimic browser behaviour or canonicize shit
+ stripTrackersMemOpts
+ )
+ const httpClient = httpClientGen(normalize, gotOptions, canonicizeMemOpts)
// Normalize URL so that we can search by URL.
async function normalizePlus(url = '') {
@@ -39,12 +52,13 @@ export default (
debug('Normalization first pass: %s', url)
// 2. Follow redirects to deal with "intermediate" links (such as the links on google search results)
- const res = await httpClient.get(link, { context: { normalize } })
+ const res = await httpClient.get(link)
debug('Normalization second pass: %s', res.url)
// At this point, the link will be completely normalized based on canonical links (if one exists)
return res.url
}
+ // global timeout for the ENTIRE function, because I'm afraid of blocking the event loop w/ some of the more compute-intensive shit
return url => pTimeout(normalizePlus(url), timeoutMs)
}
diff --git a/package-lock.json b/package-lock.json
index 69229eb..c783266 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -13,11 +13,13 @@
"debug": "^4.3.2",
"got": "^12.0.0-beta.4",
"got-scraping": "^3.0.1",
- "got-ssrf": "^1.0.2",
+ "got-ssrf": "^1.1.0",
"ipaddr.js": "^2.0.1",
"leven": "^4.0.0",
"lodash": "^4.17.21",
+ "mem": "^9.0.1",
"normalize-url": "^7.0.1",
+ "p-memoize": "^4.0.1",
"p-timeout": "^5.0.0",
"quick-lru": "^6.0.1",
"tld-extract": "^2.0.1"
@@ -3430,10 +3432,11 @@
}
},
"node_modules/got-ssrf": {
- "version": "1.0.2",
- "resolved": "https://registry.npmjs.org/got-ssrf/-/got-ssrf-1.0.2.tgz",
- "integrity": "sha512-9AkQhk97CXekxF1VLugiVBBHDQK96r/AgH0Z+iwgOW/fH97AblOhroLzblKnUwSkvQBFMqNacKCmUfsbc+Xq9w==",
+ "version": "1.1.0",
+ "resolved": "https://registry.npmjs.org/got-ssrf/-/got-ssrf-1.1.0.tgz",
+ "integrity": "sha512-XsoB8S+0FqrnFGLu8M4HMZa3T+spFpVPQ0A/MDIl2U6KomrYbYUTWou3lGa7fvniqcP6R9dObkFZeAA5so8fNA==",
"dependencies": {
+ "debug": "^4.3.2",
"got": "^12.0.0-beta.4",
"ipaddr.js": "^2.0.1"
},
@@ -5386,6 +5389,43 @@
"tmpl": "1.0.x"
}
},
+ "node_modules/map-age-cleaner": {
+ "version": "0.1.3",
+ "resolved": "https://registry.npmjs.org/map-age-cleaner/-/map-age-cleaner-0.1.3.tgz",
+ "integrity": "sha512-bJzx6nMoP6PDLPBFmg7+xRKeFZvFboMrGlxmNj9ClvX53KrmvM5bXFXEWjbz4cz1AFn+jWJ9z/DJSz7hrs0w3w==",
+ "dependencies": {
+ "p-defer": "^1.0.0"
+ },
+ "engines": {
+ "node": ">=6"
+ }
+ },
+ "node_modules/mem": {
+ "version": "9.0.1",
+ "resolved": "https://registry.npmjs.org/mem/-/mem-9.0.1.tgz",
+ "integrity": "sha512-f4uEX3Ley9FZqcFIRSBr2q43x1bJQeDvsxgkSN/BPnA7jY9Aue4sBU2dsjmpDwiaY/QY1maNCeosbUHQWzzdQw==",
+ "dependencies": {
+ "map-age-cleaner": "^0.1.3",
+ "mimic-fn": "^4.0.0"
+ },
+ "engines": {
+ "node": ">=12.20"
+ },
+ "funding": {
+ "url": "https://github.com/sindresorhus/mem?sponsor=1"
+ }
+ },
+ "node_modules/mem/node_modules/mimic-fn": {
+ "version": "4.0.0",
+ "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-4.0.0.tgz",
+ "integrity": "sha512-vqiC06CuhBTUdZH+RYl8sFrL096vA45Ok5ISO6sE/Mr1jRbGH4Csnhi8f3wKVl7x8mO4Au7Ir9D3Oyv1VYMFJw==",
+ "engines": {
+ "node": ">=12"
+ },
+ "funding": {
+ "url": "https://github.com/sponsors/sindresorhus"
+ }
+ },
"node_modules/memorystream": {
"version": "0.3.1",
"resolved": "https://registry.npmjs.org/memorystream/-/memorystream-0.3.1.tgz",
@@ -5952,6 +5992,14 @@
"node": ">=8"
}
},
+ "node_modules/p-defer": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/p-defer/-/p-defer-1.0.0.tgz",
+ "integrity": "sha1-n26xgvbJqozXQwBKfU+WsZaw+ww=",
+ "engines": {
+ "node": ">=4"
+ }
+ },
"node_modules/p-each-series": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/p-each-series/-/p-each-series-2.2.0.tgz",
@@ -6003,6 +6051,44 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
+ "node_modules/p-memoize": {
+ "version": "4.0.1",
+ "resolved": "https://registry.npmjs.org/p-memoize/-/p-memoize-4.0.1.tgz",
+ "integrity": "sha512-km0sP12uE0dOZ5qP+s7kGVf07QngxyG0gS8sYFvFWhqlgzOsSy+m71aUejf/0akxj5W7gE//2G74qTv6b4iMog==",
+ "dependencies": {
+ "mem": "^6.0.1",
+ "mimic-fn": "^3.0.0"
+ },
+ "engines": {
+ "node": ">=10"
+ },
+ "funding": {
+ "url": "https://github.com/sindresorhus/p-memoize?sponsor=1"
+ }
+ },
+ "node_modules/p-memoize/node_modules/mem": {
+ "version": "6.1.1",
+ "resolved": "https://registry.npmjs.org/mem/-/mem-6.1.1.tgz",
+ "integrity": "sha512-Ci6bIfq/UgcxPTYa8dQQ5FY3BzKkT894bwXWXxC/zqs0XgMO2cT20CGkOqda7gZNkmK5VP4x89IGZ6K7hfbn3Q==",
+ "dependencies": {
+ "map-age-cleaner": "^0.1.3",
+ "mimic-fn": "^3.0.0"
+ },
+ "engines": {
+ "node": ">=8"
+ },
+ "funding": {
+ "url": "https://github.com/sindresorhus/mem?sponsor=1"
+ }
+ },
+ "node_modules/p-memoize/node_modules/mimic-fn": {
+ "version": "3.1.0",
+ "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-3.1.0.tgz",
+ "integrity": "sha512-Ysbi9uYW9hFyfrThdDEQuykN4Ey6BuwPD2kpI5ES/nFTDn/98yxYNLZJcgUAKPT/mcrLLKaGzJR9YVxJrIdASQ==",
+ "engines": {
+ "node": ">=8"
+ }
+ },
"node_modules/p-timeout": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.0.0.tgz",
@@ -10107,10 +10193,11 @@
}
},
"got-ssrf": {
- "version": "1.0.2",
- "resolved": "https://registry.npmjs.org/got-ssrf/-/got-ssrf-1.0.2.tgz",
- "integrity": "sha512-9AkQhk97CXekxF1VLugiVBBHDQK96r/AgH0Z+iwgOW/fH97AblOhroLzblKnUwSkvQBFMqNacKCmUfsbc+Xq9w==",
+ "version": "1.1.0",
+ "resolved": "https://registry.npmjs.org/got-ssrf/-/got-ssrf-1.1.0.tgz",
+ "integrity": "sha512-XsoB8S+0FqrnFGLu8M4HMZa3T+spFpVPQ0A/MDIl2U6KomrYbYUTWou3lGa7fvniqcP6R9dObkFZeAA5so8fNA==",
"requires": {
+ "debug": "^4.3.2",
"got": "^12.0.0-beta.4",
"ipaddr.js": "^2.0.1"
}
@@ -11570,6 +11657,30 @@
"tmpl": "1.0.x"
}
},
+ "map-age-cleaner": {
+ "version": "0.1.3",
+ "resolved": "https://registry.npmjs.org/map-age-cleaner/-/map-age-cleaner-0.1.3.tgz",
+ "integrity": "sha512-bJzx6nMoP6PDLPBFmg7+xRKeFZvFboMrGlxmNj9ClvX53KrmvM5bXFXEWjbz4cz1AFn+jWJ9z/DJSz7hrs0w3w==",
+ "requires": {
+ "p-defer": "^1.0.0"
+ }
+ },
+ "mem": {
+ "version": "9.0.1",
+ "resolved": "https://registry.npmjs.org/mem/-/mem-9.0.1.tgz",
+ "integrity": "sha512-f4uEX3Ley9FZqcFIRSBr2q43x1bJQeDvsxgkSN/BPnA7jY9Aue4sBU2dsjmpDwiaY/QY1maNCeosbUHQWzzdQw==",
+ "requires": {
+ "map-age-cleaner": "^0.1.3",
+ "mimic-fn": "^4.0.0"
+ },
+ "dependencies": {
+ "mimic-fn": {
+ "version": "4.0.0",
+ "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-4.0.0.tgz",
+ "integrity": "sha512-vqiC06CuhBTUdZH+RYl8sFrL096vA45Ok5ISO6sE/Mr1jRbGH4Csnhi8f3wKVl7x8mO4Au7Ir9D3Oyv1VYMFJw=="
+ }
+ }
+ },
"memorystream": {
"version": "0.3.1",
"resolved": "https://registry.npmjs.org/memorystream/-/memorystream-0.3.1.tgz",
@@ -11993,6 +12104,11 @@
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz",
"integrity": "sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg=="
},
+ "p-defer": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/p-defer/-/p-defer-1.0.0.tgz",
+ "integrity": "sha1-n26xgvbJqozXQwBKfU+WsZaw+ww="
+ },
"p-each-series": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/p-each-series/-/p-each-series-2.2.0.tgz",
@@ -12026,6 +12142,31 @@
"aggregate-error": "^3.0.0"
}
},
+ "p-memoize": {
+ "version": "4.0.1",
+ "resolved": "https://registry.npmjs.org/p-memoize/-/p-memoize-4.0.1.tgz",
+ "integrity": "sha512-km0sP12uE0dOZ5qP+s7kGVf07QngxyG0gS8sYFvFWhqlgzOsSy+m71aUejf/0akxj5W7gE//2G74qTv6b4iMog==",
+ "requires": {
+ "mem": "^6.0.1",
+ "mimic-fn": "^3.0.0"
+ },
+ "dependencies": {
+ "mem": {
+ "version": "6.1.1",
+ "resolved": "https://registry.npmjs.org/mem/-/mem-6.1.1.tgz",
+ "integrity": "sha512-Ci6bIfq/UgcxPTYa8dQQ5FY3BzKkT894bwXWXxC/zqs0XgMO2cT20CGkOqda7gZNkmK5VP4x89IGZ6K7hfbn3Q==",
+ "requires": {
+ "map-age-cleaner": "^0.1.3",
+ "mimic-fn": "^3.0.0"
+ }
+ },
+ "mimic-fn": {
+ "version": "3.1.0",
+ "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-3.1.0.tgz",
+ "integrity": "sha512-Ysbi9uYW9hFyfrThdDEQuykN4Ey6BuwPD2kpI5ES/nFTDn/98yxYNLZJcgUAKPT/mcrLLKaGzJR9YVxJrIdASQ=="
+ }
+ }
+ },
"p-timeout": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-5.0.0.tgz",
diff --git a/package.json b/package.json
index 8384247..c6945c6 100644
--- a/package.json
+++ b/package.json
@@ -20,11 +20,13 @@
"debug": "^4.3.2",
"got": "^12.0.0-beta.4",
"got-scraping": "^3.0.1",
- "got-ssrf": "^1.0.2",
+ "got-ssrf": "^1.1.0",
"ipaddr.js": "^2.0.1",
"leven": "^4.0.0",
"lodash": "^4.17.21",
+ "mem": "^9.0.1",
"normalize-url": "^7.0.1",
+ "p-memoize": "^4.0.1",
"p-timeout": "^5.0.0",
"quick-lru": "^6.0.1",
"tld-extract": "^2.0.1"
diff --git a/utils/__fixtures__/empty-cache.js b/utils/__fixtures__/empty-cache.js
new file mode 100644
index 0000000..666808e
--- /dev/null
+++ b/utils/__fixtures__/empty-cache.js
@@ -0,0 +1,21 @@
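+// A Map-compatible cache stub: has() is always false, so memoized functions under test always recompute.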
+export default class EmptyCache {
+ get() {
+ // noop
+ }
+
+ set() {
+ // noop
+ }
+
+ has() {
+ return false
+ }
+
+ delete() {
+ // noop
+ }
+
+ clear() {
+ // noop
+ }
+}
diff --git a/utils/__mocks__/strip-trackers.js b/utils/__mocks__/strip-trackers.js
index 3b6bb6e..1465779 100644
--- a/utils/__mocks__/strip-trackers.js
+++ b/utils/__mocks__/strip-trackers.js
@@ -1,3 +1,5 @@
-export default function (url) {
- return url // much function, very wow
+export default function gen() {
+ return function clearUrl(url) {
+ return url // much function, very wow
+ }
}
diff --git a/utils/canonicize.js b/utils/canonicize.js
index 036cdda..1eeb117 100644
--- a/utils/canonicize.js
+++ b/utils/canonicize.js
@@ -2,136 +2,145 @@ import cheerio from 'cheerio'
import trim from 'lodash/trim.js'
import leven from 'leven'
import parseTld from 'tld-extract'
+import mem from 'p-memoize'
import urlIsAmp from './url-is-amp.js'
import logger from './logger.js'
const debug = logger('utils/canonicize.js')
-// Look for the canonical link (also un-AMP-ifies the canonical link)
-// Not writing a separate metascraper-canonical library for this, as the "standard" way of determining
-// canonical link includes looking at the HTTP header: https://developers.google.com/search/docs/advanced/crawling/consolidate-duplicate-urls
-export default async function canonicizeHook(res) {
- if (!res.request.options.context.normalize) {
- debug('got.context.normalize not set; not canonicizing...')
- return res
- }
-
- const { normalize } = res.request.options.context
-
- // Normalize the "final" URL up front
- const normalizedUrl = await normalize(res.url)
- debug('Normalized res.url %s to %s', res.url, normalizedUrl)
-
- // Ripped from https://github.com/KilledMufasa/AmputatorBot/blob/master/helpers/canonical_methods.py
- const $ = cheerio.load(res.body)
- const matches = []
-
- // 5.1: rel=canonical tag
- $('link[rel=canonical]').each(function () {
- const match = $(this).attr('href')
- matches.push(match)
- debug('Matched rel=canonical tag: %s', match)
- })
-
- // 5.2: rel=canonical HTTP header
- if ('link' in res.headers) {
- debug('"Link" header exists, searching for rel=canonical...')
-
- // We're looking for something like:
- // Link: <https://example.com/>; rel="canonical", ...
- res.headers.link.split(',').forEach(linkHeader => {
- const parts = linkHeader.split(';')
- if (parts.length !== 2) {
- debug('Not enough parts exist in the header: %s', linkHeader)
- return
- }
+export default function canonicizeGen(normalize, memOpts) {
+ // Look for the canonical link (also un-AMP-ifies the canonical link)
+ // Not writing a separate metascraper-canonical library for this, as the "standard" way of determining
+ // canonical link includes looking at the HTTP header: https://developers.google.com/search/docs/advanced/crawling/consolidate-duplicate-urls
+ async function getCanonical(res, normalizedUrl) {
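+ // normalizedUrl is taken as a parameter (rather than recomputed) so it can double as the memoization key below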
+ // Ripped from https://github.com/KilledMufasa/AmputatorBot/blob/master/helpers/canonical_methods.py
+ const $ = cheerio.load(res.body)
+ const matches = []
+
+ // 5.1: rel=canonical tag
+ $('link[rel=canonical]').each(function () {
+ const match = $(this).attr('href')
+ matches.push(match)
+ debug('Matched rel=canonical tag: %s', match)
+ })
- const [linkStr, relStr] = parts
- debug('Extracted link fragment %s and rel fragment %s', linkStr, relStr)
+ // 5.2: rel=canonical HTTP header
+ if ('link' in res.headers) {
+ debug('"Link" header exists, searching for rel=canonical...')
+
+ // We're looking for something like:
+ // Link: <https://example.com/>; rel="canonical", ...
+ res.headers.link.split(',').forEach(linkHeader => {
+ const parts = linkHeader.split(';')
+ if (parts.length !== 2) {
+ debug('Not enough parts exist in the header: %s', linkHeader)
+ return
+ }
+
+ const [linkStr, relStr] = parts
+ debug('Extracted link fragment %s and rel fragment %s', linkStr, relStr)
+
+ // rel="canonical", rel=canonical, rel canonical, etc.
+ const relStrLower = relStr.toLowerCase()
+ if (relStrLower.includes('rel') && relStrLower.includes('canonical')) {
+ // <https://example.com>, https://example.com, etc.
+ const url = trim(linkStr.trim(), ['<', '>', ' '])
+ matches.push(url)
+ debug('Found canonical in header: %s', url)
+ }
+ })
+ }
- // rel="canonical", rel=canonical, rel canonical, etc.
- const relStrLower = relStr.toLowerCase()
- if (relStrLower.includes('rel') && relStrLower.includes('canonical')) {
- // <https://example.com>, https://example.com, etc.
- const url = trim(linkStr.trim(), ['<', '>', ' '])
- matches.push(url)
- debug('Found canonical in header: %s', url)
- }
+ // 5.3: AMP variant
+ $('a.amp-canurl').each(function () {
+ const match = $(this).attr('href')
+ matches.push(match)
+ debug('Found non-AMP variant: %s', match)
})
- }
-
- // 5.3: AMP variant
- $('a.amp-canurl').each(function () {
- const match = $(this).attr('href')
- matches.push(match)
- debug('Found non-AMP variant: %s', match)
- })
- // 5.4: OpenGraph
- $('meta[property="og:url"]').each(function () {
- const match = $(this).attr('content')
- matches.push(match)
- debug('Found OpenGraph og:url: %s', match)
- })
+ // 5.4: OpenGraph
+ $('meta[property="og:url"]').each(function () {
+ const match = $(this).attr('content')
+ matches.push(match)
+ debug('Found OpenGraph og:url: %s', match)
+ })
- // 5.5: Sitemap (I'm not doing this shit)
-
- // The only reason we want canonical is to make our job with normalization easier;
- // So we need to make sure the canonical link IS for the url we're trying to normalize!
-
- const { hostname: domain } = new URL(normalizedUrl)
- const { domain: baseDomain } = parseTld(normalizedUrl)
- debug('Finding the best match for host %s and TLD %s...', domain, baseDomain)
-
- let result = normalizedUrl
- let minDist = Number.POSITIVE_INFINITY
-
- for (const match of matches) {
- let link = match
-
- // turn relative to absolute URL
- if (match.startsWith('/')) link = `${domain}${match}`
- debug('Considering match %s...', link)
-
- // Skip invalid links
- try {
- link = await normalize(link)
- debug('Normalized match to %s', link)
-
- // Ensure that every match is a valid URL w/ a matching domain
- // In this case, we're only matching the "top-level" domain -
- // e.g. subdomain.(domain.com) - as a lot of sites host their shit on amp.(site.com)
- // so we want to include references to www.site.com (actually *prefer* those)
- const { domain: matchDomain } = parseTld(link)
- if (matchDomain !== baseDomain) {
- debug(
- 'The domain %s does not match the base domain %s',
- matchDomain,
- baseDomain
- )
+ // 5.5: Sitemap (I'm not doing this shit)
+
+ // The only reason we want canonical is to make our job with normalization easier;
+ // So we need to make sure the canonical link IS for the url we're trying to normalize!
+
+ const { hostname: domain } = new URL(normalizedUrl)
+ const { domain: baseDomain } = parseTld(normalizedUrl)
+ debug(
+ 'Finding the best match for host %s and TLD %s...',
+ domain,
+ baseDomain
+ )
+
+ let result = normalizedUrl
+ let minDist = Number.POSITIVE_INFINITY
+
+ for (const match of matches) {
+ let link = match
+
+ // turn relative to absolute URL
+ if (match.startsWith('/')) link = `${domain}${match}`
+ debug('Considering match %s...', link)
+
+ // Skip invalid links
+ try {
+ link = await normalize(link)
+ debug('Normalized match to %s', link)
+
+ // Ensure that every match is a valid URL w/ a matching domain
+ // In this case, we're only matching the "top-level" domain -
+ // e.g. subdomain.(domain.com) - as a lot of sites host their shit on amp.(site.com)
+ // so we want to include references to www.site.com (actually *prefer* those)
+ const { domain: matchDomain } = parseTld(link)
+ if (matchDomain !== baseDomain) {
+ debug(
+ 'The domain %s does not match the base domain %s',
+ matchDomain,
+ baseDomain
+ )
+ continue
+ }
+
+ // Then, ensure that links aren't AMP'd
+ if (urlIsAmp(link)) {
+ debug('Link %s is AMP, skipping...', link)
+ continue
+ }
+ } catch (err) {
+ debug('Error %s while considering match %s', err, match)
continue
}
- // Then, ensure that links aren't AMP'd
- if (urlIsAmp(link)) {
- debug('Link %s is AMP, skipping...', link)
- continue
+ // Then, sort by similarity to the normalized URL of the page we ended up in
+ const dist = leven(normalizedUrl, link)
+ if (dist < minDist) {
+ minDist = dist
+ result = link
}
- } catch (err) {
- debug('Error %s while considering match %s', err, match)
- continue
}
- // Then, sort by similarity to the normalized URL of the page we ended up in
- const dist = leven(normalizedUrl, link)
- if (dist < minDist) {
- minDist = dist
- result = link
- }
+ debug('Found best match %s', result)
+ return result
}
- res.url = result
- debug('Found best match %s', result)
+ const memCanonical = mem(getCanonical, {
+ ...memOpts,
+ cacheKey: args => args[1] // we want to cache by the normalized url in order to raise the hit rate
+ })
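+ // Since only args[1] keys the cache, the first response seen for a given normalized URL supplies the canonical for everyone until maxAge expires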
+
+ return async function canonicizeHook(res) {
+ // Normalize the "final" URL up front
+ const normalizedUrl = await normalize(res.url)
+ debug('Normalized res.url %s to %s', res.url, normalizedUrl)
- return res
+ res.url = await memCanonical(res, normalizedUrl)
+
+ return res
+ }
}
diff --git a/utils/canonicize.test.js b/utils/canonicize.test.js
index 9dbe9c8..64efcd0 100644
--- a/utils/canonicize.test.js
+++ b/utils/canonicize.test.js
@@ -1,18 +1,26 @@
import { expect, describe, it } from '@jest/globals'
import got from 'got'
import nock from 'nock'
-import hook from './canonicize'
+import hookGen from './canonicize'
+import EmptyCache from './__fixtures__/empty-cache'
nock.disableNetConnect()
+async function mockNormalize(url) {
+ // just append protocol, do nothing else
+ return url.startsWith('http') ? url : `http://${url}`
+}
+
describe('extracting canonical links', () => {
const httpClient = got.extend({
- hooks: { afterResponse: [hook] },
- context: {
- normalize: async url =>
- url.startsWith('http') || url.startsWith('https')
- ? url
- : `http://${url}`
+ hooks: {
+ afterResponse: [
+ hookGen(mockNormalize, {
+ cache: new EmptyCache()
+ })
+ ]
}
})
diff --git a/utils/http-client.js b/utils/http-client.js
index 384dad9..4e9f900 100644
--- a/utils/http-client.js
+++ b/utils/http-client.js
@@ -1,16 +1,16 @@
import got from 'got'
import { gotScraping } from 'got-scraping'
import { gotSsrf } from 'got-ssrf'
-import canonicizeHook from './canonicize.js'
+import canonicizeHookGen from './canonicize.js'
-export default function httpClient(gotOptions) {
+export default function httpClient(normalize, gotOptions, canonicizeMemOpts) {
return got
.extend(gotOptions)
.extend(gotSsrf)
.extend(gotScraping)
.extend({
hooks: {
- afterResponse: [canonicizeHook]
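+ // the generated hook closes over normalize, replacing the old context.normalize plumbing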
+ afterResponse: [canonicizeHookGen(normalize, canonicizeMemOpts)]
}
})
}
diff --git a/utils/normalize-url.js b/utils/normalize-url.js
index f506b0a..f7a5cde 100644
--- a/utils/normalize-url.js
+++ b/utils/normalize-url.js
@@ -1,11 +1,18 @@
import normalizeUrl from 'normalize-url'
-import stripTrackers from './strip-trackers.js'
+import stripTrackersGen from './strip-trackers.js'
import { URL } from 'url'
import logger from './logger.js'
const debug = logger('utils/normalize-url.js')
-export default function gen(normalizeUrlOptions, dnsLookup, httpClient) {
+export default function gen(
+ normalizeUrlOptions,
+ dnsLookup,
+ httpClient,
+ memOpts
+) {
+ const stripTrackers = stripTrackersGen(memOpts)
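+ // memoized so that repeat URLs skip re-running the ClearURLs rules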
+
return async function normalize(originalUrl) {
// We default to non-www, https links
const preferredOptions = {
diff --git a/utils/normalize-url.test.js b/utils/normalize-url.test.js
index c6d6be7..3eb7da0 100644
--- a/utils/normalize-url.test.js
+++ b/utils/normalize-url.test.js
@@ -1,7 +1,6 @@
import { expect, describe, it } from '@jest/globals'
import gen from './normalize-url'
-
-process.env.SKIP_CLEARURLS = 1
+import EmptyCache from './__fixtures__/empty-cache'
const normalize = gen(
{ stripHash: true, removeQueryParameters: [] },
@@ -14,7 +13,8 @@ const normalize = gen(
// throw if site doesn't support HTTPS
if (url === 'https://www.test3.com/asdf') throw new Error()
}
- }
+ },
+ { cache: new EmptyCache() }
)
describe('link normalization', () => {
diff --git a/utils/strip-trackers.js b/utils/strip-trackers.js
index 944d588..9bbd78d 100644
--- a/utils/strip-trackers.js
+++ b/utils/strip-trackers.js
@@ -1,11 +1,12 @@
import { URL } from 'url'
+import mem from 'mem'
import load from '../data/loader.js'
import logger from './logger.js'
const providers = load()
const debug = logger('utils/strip-trackers.js')
-export default function clearUrl(url) {
+function clearUrl(url) {
debug('Stripping trackers for %s', url)
// Clean the given URL with the provided rules data.
@@ -85,3 +86,7 @@ export default function clearUrl(url) {
return url
}
+
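+// clearUrl is synchronous, so plain mem suffices here; p-memoize is reserved for the async canonical lookup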
+export default function clearUrlGen(memOpts) {
+ return mem(clearUrl, memOpts)
+}
diff --git a/utils/strip-trackers.test.js b/utils/strip-trackers.test.js
index 5c9d513..bdb4e14 100644
--- a/utils/strip-trackers.test.js
+++ b/utils/strip-trackers.test.js
@@ -1,5 +1,8 @@
import { expect, describe, it } from '@jest/globals'
-import clearUrl from './strip-trackers'
+import clearUrlGen from './strip-trackers'
+import EmptyCache from './__fixtures__/empty-cache'
+
+const clearUrl = clearUrlGen({ cache: new EmptyCache() })
describe('stripping trackers', () => {
it('blocks "complete providers"', () => {