scrapper.js
const scrape = require('website-scraper');
const {parseHTML} = require('./parseHTML');
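// Local helper module that parses the downloaded HTML files and collects
// the links to static assets (see the overview comment below).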
const fs = require('fs');
const directory = "./annsummers";
const outputFile = "./data.json";
/**
 * Use the sitemap to reach each category and take the first items there.
 * Once the site is downloaded, parse all HTML files asynchronously with fs
 * and produce a JSON file with links to the static assets.
 * Check whether the scrape directory exists so we don't scrape on every run,
 * and whether the JSON already exists so we don't recalculate it each time.
 * DELETE THE DIRECTORY TO RESCRAPE.
 */
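// Note: fs.stat passes an error to its callback when the path is missing,
// so a non-null err below means the file or directory still has to be created.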
fs.stat(outputFile, function (err, stats) {
    if (!err) {
        console.log("FILE EXISTS. Delete " + outputFile + " if you need to rerun.");
    } else {
        fs.stat(directory + '/', function (err, stats) {
            console.log("Err: " + err);
            if (!err) {
                // The site was already scraped: just re-parse the HTML on disk.
                parseHTML(directory).then((result) => {
                    removeEmptyAndSave(result);
                });
            } else {
                scrape({
                    urls: ['http://www.annsummers.com/sitemap'],
                    directory: directory,
                    // Save only <img> sources as static assets.
                    sources: [
                        {selector: 'img', attr: 'src'}
                    ],
                    // Follow links from the sitemap one level deep.
                    recursive: true,
                    maxRecursiveDepth: 1,
                }).then(() => {
                    parseHTML(directory).then((result) => {
                        removeEmptyAndSave(result);
                    });
                }).catch(console.log);
            }
        });
    }
});
/**
 * Exit the process explicitly, since the promise chain never finishes
 * on its own and would keep the script running.
 */
function removeEmptyAndSave(result) {
    fs.writeFile(outputFile, JSON.stringify(result), (err) => {
        if (!err) {
            console.log("JSON SAVED TO " + outputFile + " FILE BELOW");
            console.log(result);
            process.exit(0);
        } else {
            console.log("Didn't save file. Error: " + err);
            process.exit(1);
        }
    });
}