-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathcrawler.js
More file actions
58 lines (52 loc) · 1.54 KB
/
crawler.js
File metadata and controls
58 lines (52 loc) · 1.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
const _ = require('lodash');
const db = require('./db');
const puppeteer = require('puppeteer');
const url = require('url');
/** Namespaced loggers; enable with DEBUG=crawler:* in the environment. */
const createDebug = require('debug');
const debug = {
  crawl: createDebug('crawler:crawl'),
  page: createDebug('crawler:page'),
};
/**
 * Crawls pages breadth-first: pops queued URLs from the db, scrapes each
 * page's same-host links with Puppeteer, and stores them back for later
 * crawling.
 *
 * @param {string} entry - Seed URL used only when the db queue is empty.
 * @param {Object} [options]
 * @param {number} [options.maxRadius=Infinity] - Targets at or beyond this
 *   link-distance are popped from the queue but not scraped.
 * @returns {Promise<void>} Resolves when the queue is drained.
 */
const crawl = async (entry, options = {}) => {
  debug.crawl('Crawler started');
  let target = (await db.popUrl()) || { url: entry, radius: 0 };
  const { maxRadius = Infinity } = options;
  if (!target.url) {
    debug.crawl('Nothing to crawl');
    return;
  }
  // Only links on the same host as the first target are kept.
  const entryUrl = url.parse(target.url);
  const browser = await puppeteer.launch();
  debug.crawl('Puppeteer started');
  let count = 0;
  try {
    const page = await browser.newPage();
    while (target) {
      if (target.radius >= maxRadius) {
        debug.page(`Max radius reached ${target.url} not scraped`);
      } else {
        // One bad page (navigation timeout, DNS failure) must not abort
        // the whole crawl — log it and move on to the next queued URL.
        try {
          count++;
          debug.page(`Crawling: ${target.url}`);
          await page.goto(target.url);
          debug.page(`Page loaded`);
          // Collect every anchor href from the rendered DOM.
          const links = await page.evaluate(() => {
            return Array.from(document.querySelectorAll('a')).map(
              link => link.href
            );
          });
          // Single predicate: plain Array#filter, no lodash chain needed.
          const outboundUrls = links.filter(
            link => url.parse(link).host === entryUrl.host
          );
          debug.page(`Scraped ${outboundUrls.length} urls`);
          await db.store({
            outboundUrls,
            radius: target.radius + 1, // children are one hop further out
            url: target.url,
          });
        } catch (err) {
          debug.page(`Failed to crawl ${target.url}: ${err.message}`);
        }
      }
      target = await db.popUrl();
    }
    debug.crawl(`Crawler finished after crawling ${count} pages`);
  } finally {
    // Always release the Chromium process, even if the loop throws —
    // the original fire-and-forget close() leaked it on error.
    await browser.close();
  }
};
module.exports = crawl;