-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl-instagram.js
79 lines (72 loc) · 2.3 KB
/
crawl-instagram.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
// Note: To keep this Instagram crawl-and-download script simple, the `user-data-dir` flag points at a Chrome profile that already has a logged-in Instagram session.
const puppeteer = require('puppeteer');
const request = require('request');
const fs = require('fs');
const path = require('path');
/**
 * Factory for independent, closure-backed counters.
 *
 * Each call to `getNewCounterInstance()` yields a fresh counter starting at 0
 * whose state is private to that instance:
 *   - `resetCounter()`     sets the count back to 0 and returns it.
 *   - `incrementCounter()` adds 1 to the count and returns the new value.
 */
const counterFactory = {
  getNewCounterInstance() {
    // Private state, reachable only through the two closures below.
    let current = 0;

    const resetCounter = () => {
      current = 0;
      return current;
    };

    const incrementCounter = () => {
      current += 1;
      return current;
    };

    return { resetCounter, incrementCounter };
  }
};
// Shared counter whose next value is used as the base filename of each downloaded image.
const downloadedImageCounter = counterFactory.getNewCounterInstance();
// Upper bound for the scroll iteration counter; scrolling stops once the counter reaches this value.
const maxPages = 10;
// Pause after each scroll before the next iteration starts.
const waitBetweenPageScrolls = 10000; // milliseconds
/**
 * Collects the `src` of every `<img>` that is a direct child of an element
 * with class `KL4Bh` in the given page.
 *
 * @param {object} page - Object exposing `evaluate(fn)`, which runs `fn` in the page context.
 * @returns {Promise<string[]>} The image URLs, in document order.
 */
async function extractImageURLs(page) {
  // Runs inside the page, so it may only touch page globals such as `document`.
  const collectImageSources = () => {
    const imageNodes = document.querySelectorAll('.KL4Bh > img');
    return Array.from(imageNodes, (node) => node.src);
  };

  return page.evaluate(collectImageSources);
}
/**
 * Downloads one image into `./downloads/`, naming the file with the next
 * value of the shared download counter plus the extension found in the URL
 * path (the query string is ignored when deriving the extension).
 *
 * The target directory is created if it does not exist. The returned promise
 * settles only when the file has been fully written, so callers can await
 * real completion and observe failures.
 *
 * @param {string} imageURL - URL of the image to fetch.
 * @returns {Promise<string>} Resolves with the path of the written file.
 *   Rejects if the HTTP request stream or the file write stream emits an error,
 *   or if the download directory cannot be created.
 */
async function download(imageURL) {
  const extn = path.extname(imageURL.split('?')[0]);
  // Take the counter value before the first await so files are numbered in call order.
  const filename = `./downloads/${downloadedImageCounter.incrementCounter()}${extn}`;
  await fs.promises.mkdir(path.dirname(filename), { recursive: true });

  return new Promise((resolve, reject) => {
    const file = fs.createWriteStream(filename);

    // `pipe` does not close the destination when the source fails, so release it here.
    const fail = (err) => {
      file.destroy();
      reject(err);
    };

    file.on('error', fail);
    file.on('finish', () => {
      // Only report success once every byte has been flushed to disk.
      console.debug(`Saved: ${filename}`);
      resolve(filename);
    });

    request(imageURL).on('error', fail).pipe(file);
  });
}
/**
 * Scrolls the page's window to the full height of its document element.
 *
 * @param {object} page - Object exposing `evaluate(fn)`, which runs `fn` in the page context.
 * @returns {Promise<void>}
 */
async function scrollToBottom(page) {
  // Runs inside the page, so it may only touch page globals (`window`, `document`).
  const jumpToPageEnd = () => {
    const { scrollHeight } = document.documentElement;
    window.scrollTo(0, scrollHeight);
  };

  await page.evaluate(jumpToPageEnd);
}
/**
 * Script entry point.
 *
 * Launches Chrome with an existing user profile, opens Instagram, scrolls the
 * feed until the iteration counter reaches `maxPages`, collects the image
 * URLs currently in the page, and downloads each of them.
 *
 * The browser is always closed on exit, every download is awaited, and any
 * failure is logged and reflected in a non-zero process exit code.
 */
(async () => {
  // Timer-based pause that does not depend on Puppeteer's version-specific wait helpers.
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const browser = await puppeteer.launch({
    slowMo: 100,
    headless: false,
    executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    args: ['--user-data-dir=/Users/jojijaco/chrome-picasasword1']
  });

  try {
    const [page] = await browser.pages();
    await page.goto('https://www.instagram.com/');

    // The counter is logged first and the loop exits once it reaches maxPages,
    // so at most maxPages - 1 scrolls are performed.
    for (let iteration = 1; ; iteration += 1) {
      console.log(`scrolled page - ${iteration}`);
      if (iteration >= maxPages) break;
      await scrollToBottom(page);
      await sleep(waitBetweenPageScrolls);
    }

    const images = await extractImageURLs(page);
    console.log(`${images.length} images in download queue.`);

    // `map` invokes download() synchronously in queue order; each failure is
    // handled per image so one bad URL does not abort the remaining downloads.
    const outcomes = await Promise.all(
      images.map(async (imageURL) => {
        try {
          await download(imageURL);
          return true;
        } catch (err) {
          console.error(`Failed to download ${imageURL}: ${err.message}`);
          return false;
        }
      })
    );

    const failedCount = outcomes.filter((succeeded) => !succeeded).length;
    if (failedCount > 0) {
      console.error(`${failedCount} of ${images.length} downloads failed.`);
      process.exitCode = 1;
    }
  } finally {
    // Release the Chrome process even when crawling or downloading throws.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});