forked from leaky-forms/leaky-forms-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawlerConductor.js
117 lines (102 loc) · 4.85 KB
/
crawlerConductor.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
const os = require('os');
const cores = os.cpus().length;
const chalk = require('chalk').default;
const async = require('async');
const crawl = require('./crawler');
const URL = require('url').URL;
const {createTimer} = require('./helpers/timer');
const createDeferred = require('./helpers/deferred');
const downloadCustomChromium = require('./helpers/downloadCustomChromium');
// eslint-disable-next-line no-unused-vars
const BaseCollector = require('./collectors/BaseCollector');
const notABot = require('./helpers/notABot');
// Hard cap on the number of crawlers running at the same time.
const MAX_NUMBER_OF_CRAWLERS = 38;// by trial and error there seem to be network bandwidth issues with more than 38 browsers.
// Passed as the first argument to async.retry for every URL that is crawled.
const MAX_NUMBER_OF_RETRIES = 1;
/**
 * Crawls one URL with fresh collector instances and passes the outcome to `dataCallback`.
 *
 * Every log line emitted by the crawl is prefixed with the (gray) hostname of the URL.
 * The returned promise rejects if the URL cannot be parsed or if the crawl itself rejects.
 *
 * @param {string} urlString
 * @param {BaseCollector[]} dataCollectors
 * @param {function} log
 * @param {boolean} filterOutFirstParty
 * @param {function(URL, import('./crawler').CollectResult): void} dataCallback
 * @param {boolean} emulateMobile
 * @param {string} proxyHost
 * @param {boolean} antiBotDetection
 * @param {string} executablePath
 * @param {number} maxLoadTimeMs
 * @param {number} extraExecutionTimeMs
 * @param {string} outputPath
 * @param {string} emailAddress
 * @param {string} passwordValue
 */
async function crawlAndSaveData(urlString, dataCollectors, log, filterOutFirstParty, dataCallback, emulateMobile, proxyHost, antiBotDetection, executablePath, maxLoadTimeMs, extraExecutionTimeMs, outputPath, emailAddress, passwordValue) {
    const url = new URL(urlString);

    /**
     * Forwards messages to `log`, prefixed with the hostname being crawled.
     *
     * @param {...any} parts
     */
    function prefixedLog(...parts) {
        return log(chalk.gray(`${url.hostname}:`), ...parts);
    }

    // Each crawl gets its own collector instances, built from the constructors of the ones passed in.
    // @ts-ignore
    const freshCollectors = dataCollectors.map(({constructor: Collector}) => new Collector());

    // The anti-bot script is only handed to the crawler when detection evasion is requested.
    let runInEveryFrame;
    if (antiBotDetection) {
        runInEveryFrame = notABot;
    }

    const crawlOptions = {
        log: prefixedLog,
        collectors: freshCollectors,
        filterOutFirstParty,
        emulateMobile,
        proxyHost,
        runInEveryFrame,
        executablePath,
        maxLoadTimeMs,
        extraExecutionTimeMs,
        outputPath,
        emailAddress,
        passwordValue
    };

    const collected = await crawl(url, crawlOptions);
    dataCallback(url, collected);
}
/**
* @param {{urls: Array<string|{url:string,dataCollectors?:BaseCollector[]}>, dataCallback: function(URL, import('./crawler').CollectResult): void, dataCollectors?: BaseCollector[], failureCallback?: function(string, Error): void, numberOfCrawlers?: number, logFunction?: function, filterOutFirstParty: boolean, emulateMobile: boolean, proxyHost: string, antiBotDetection?: boolean, chromiumVersion?: string, maxLoadTimeMs?: number, extraExecutionTimeMs?: number, outputPath:string, emailAddress: string, passwordValue: string}} options
*/
module.exports = async options => {
const deferred = createDeferred();
const log = options.logFunction || (() => {});
const failureCallback = options.failureCallback || (() => {});
let numberOfCrawlers = options.numberOfCrawlers || Math.floor(cores * 0.8);
numberOfCrawlers = Math.min(MAX_NUMBER_OF_CRAWLERS, numberOfCrawlers, options.urls.length);
// Increase number of listeners so we have at least one listener for each async process
if (numberOfCrawlers > process.getMaxListeners()) {
process.setMaxListeners(numberOfCrawlers + 1);
}
log(chalk.cyan(`Number of crawlers: ${numberOfCrawlers}\n`));
/**
* @type {string}
*/
let executablePath;
if (options.chromiumVersion) {
executablePath = await downloadCustomChromium(log, options.chromiumVersion);
}
async.eachOfLimit(options.urls, numberOfCrawlers, (urlItem, idx, callback) => {
const urlString = (typeof urlItem === 'string') ? urlItem : urlItem.url;
let dataCollectors = options.dataCollectors;
// there can be a different set of collectors for every item
if ((typeof urlItem !== 'string') && urlItem.dataCollectors) {
dataCollectors = urlItem.dataCollectors;
}
log(chalk.cyan(`Processing entry #${Number(idx) + 1} (${urlString}).`));
const timer = createTimer();
const task = crawlAndSaveData.bind(null, urlString, dataCollectors, log, options.filterOutFirstParty, options.dataCallback, options.emulateMobile, options.proxyHost, (options.antiBotDetection !== false), executablePath, options.maxLoadTimeMs, options.extraExecutionTimeMs, options.outputPath, options.emailAddress, options.passwordValue);
async.retry(MAX_NUMBER_OF_RETRIES, task, err => {
if (err) {
log(chalk.red(`Max number of retries (${MAX_NUMBER_OF_RETRIES}) exceeded for "${urlString}".`));
failureCallback(urlString, err);
} else {
log(chalk.cyan(`Processing "${urlString}" took ${timer.getElapsedTime()}s.`));
}
callback();
});
}, err => {
if (err) {
deferred.reject(err);
} else {
deferred.resolve();
}
});
await deferred.promise;
};