Skip to content

Commit

Permalink
add memoization to tracker stripping and canonicization
Browse files: browse the repository at this point in the history
  • Loading branch information
JaneJeon committed Sep 14, 2021
1 parent 7e7606f commit e862d4b
Show file tree
Hide file tree
Showing 12 changed files with 358 additions and 146 deletions.
26 changes: 20 additions & 6 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import httpClientGen from './utils/http-client.js'
import dnsLookupGen from './utils/dns-lookup.js'
import logger from './utils/logger.js'
import CacheableLookup from 'cacheable-lookup'
import { gotSsrf } from 'got-ssrf'

const debug = logger('index.js')

Expand All @@ -20,14 +21,26 @@ export default (
timeout: {
request: 14000 // global timeout
},
cache: new QuickLRU({ maxSize: 1000 }),
dnsCache: new CacheableLookup({ cache: new QuickLRU({ maxSize: 10000 }) })
cache: new QuickLRU({ maxSize: 10000 }),
dnsCache: new CacheableLookup({ cache: new QuickLRU({ maxSize: 100000 }) })
},
timeoutMs = 15000 // global timeout for the ENTIRE function, because I'm afraid of blocking the event loop w/ some of the more compute-intensive shit
timeoutMs = 15000,
canonicizeMemOpts = {
cache: new QuickLRU({ maxSize: 100000 }),
cachePromiseRejection: true,
maxAge: 86400000 // 24 hours
},
stripTrackersMemOpts = { cache: new QuickLRU({ maxSize: 100000 }) }
// The cache numbers are pulled from the most reliable source on the internet: my ass.
) => {
const httpClient = httpClientGen(gotOptions)
const dnsLookup = dnsLookupGen(gotOptions)
const normalize = normalizeUrl(normalizeUrlOptions, dnsLookup, httpClient)
const normalize = normalizeUrl(
normalizeUrlOptions,
dnsLookup,
gotSsrf.extend(gotOptions), // don't really need to mimic browser behaviour or canonicize shit
stripTrackersMemOpts
)
const httpClient = httpClientGen(normalize, gotOptions, canonicizeMemOpts)

// Normalize URL so that we can search by URL.
async function normalizePlus(url = '') {
Expand All @@ -39,12 +52,13 @@ export default (
debug('Normalization first pass: %s', url)

// 2. Follow redirects to deal with "intermediate" links (such as the links on google search results)
const res = await httpClient.get(link, { context: { normalize } })
const res = await httpClient.get(link)
debug('Normalization second pass: %s', res.url)

// At this point, the link will be completely normalized based on canonical links (if one exists)
return res.url
}

// global timeout for the ENTIRE function, because I'm afraid of blocking the event loop w/ some of the more compute-intensive shit
return url => pTimeout(normalizePlus(url), timeoutMs)
}
155 changes: 148 additions & 7 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@
"debug": "^4.3.2",
"got": "^12.0.0-beta.4",
"got-scraping": "^3.0.1",
"got-ssrf": "^1.0.2",
"got-ssrf": "^1.1.0",
"ipaddr.js": "^2.0.1",
"leven": "^4.0.0",
"lodash": "^4.17.21",
"mem": "^9.0.1",
"normalize-url": "^7.0.1",
"p-memoize": "^4.0.1",
"p-timeout": "^5.0.0",
"quick-lru": "^6.0.1",
"tld-extract": "^2.0.1"
Expand Down
21 changes: 21 additions & 0 deletions utils/__fixtures__/empty-cache.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Fixture cache that never holds anything.
 *
 * It exposes `get`, `set`, `has`, `delete` and `clear`, but every write is
 * discarded and every lookup misses, so code using it behaves as if no
 * caching were in place.
 */
export default class EmptyCache {
  /**
   * Look up an entry. Nothing is ever stored, so this is always a miss.
   * @returns {undefined}
   */
  get() {
    return undefined
  }

  /**
   * Pretend to store an entry; whatever is passed is dropped.
   * @returns {undefined}
   */
  set() {
    return undefined
  }

  /**
   * Report whether a key is present. It never is.
   * @returns {boolean} always `false`
   */
  has() {
    return false
  }

  /**
   * Pretend to remove an entry; there is nothing to remove.
   * @returns {undefined}
   */
  delete() {
    return undefined
  }

  /**
   * Pretend to empty the cache; it is already empty.
   * @returns {undefined}
   */
  clear() {
    return undefined
  }
}
6 changes: 4 additions & 2 deletions utils/__mocks__/strip-trackers.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
export default function (url) {
return url // much function, very wow
/**
 * Mock generator for the tracker stripper.
 *
 * Takes no options and hands back a `clearUrl` function that leaves its
 * input untouched.
 *
 * @returns {function(*): *} identity function named `clearUrl`
 */
export default function gen() {
  return clearUrl

  // Hoisted declaration: the mock performs no stripping and echoes the URL back.
  function clearUrl(url) {
    return url
  }
}
Loading

0 comments on commit e862d4b

Please sign in to comment.