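// scrape.js
//
// Seed a supercrawler instance with a list of URLs, honour robots.txt and
// sitemaps, follow <a href> links, and print a tag-frequency summary for
// every HTML page fetched. Assumes the dependencies are available, e.g.
// via `npm install supercrawler cheerio`.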
const supercrawler = require('supercrawler');
const cheerio = require('cheerio');
const urls = [/* TODO, put urls in this array to start from */];
const crawler = new supercrawler.Crawler({
  interval: 1000,              // time between crawl ticks, in ms
  concurrentRequestsLimit: 5,  // at most five requests in flight
  robotsCacheTime: 3600000,    // cache robots.txt results for one hour (ms)
});
// Get "Sitemaps:" directives from robots.txt
crawler.addHandler(supercrawler.handlers.robotsParser());
// Crawl sitemap files and extract their URLs.
crawler.addHandler(supercrawler.handlers.sitemapsParser());
// Pick up <a href> links from HTML documents
crawler.addHandler("text/html", supercrawler.handlers.htmlLinkParser());
// Take raw HTML and return the frequency of each tag as a single
// parseable string.
function getLineForHTML(htmlData) {
  const $ = cheerio.load(htmlData);
  const elementFreqs = $("*").get().map(ele => ele.name)
    .reduce((eleFreqs, eleName) =>
      eleFreqs.set(eleName, (eleFreqs.get(eleName) || 0) + 1), new Map());
  // Build a parseable line from the tag-frequency map
  let line = "";
  elementFreqs.forEach((value, key) => {
    line += `${key}: ${value}, `;
  });
  return line;
}
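// For example (hypothetical input; cheerio's default document parser adds
// the html/head/body wrappers):
//   getLineForHTML("<p>a</p><p>b</p>")
//   -> "html: 1, head: 1, body: 1, p: 2, "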
crawler.addHandler("text/html", function (context) {
// output the url on one line
console.log(context.url);
// then the actual data on the next
console.log(getLineForHTML(context.body));
});
// Add all the seed URLs
const currentUrlList = crawler.getUrlList();
for (const url of urls) {
  currentUrlList.insertIfNotExists(new supercrawler.Url(url));
  console.error("adding: " + url);
}
// Convert the run time from hours to seconds; setTimeout below needs milliseconds
const hoursRunning = 0.0016; // ~5.76 seconds
const minutesRunning = hoursRunning * 60;
const secondsRunning = minutesRunning * 60;
// Let the user know how long the crawl will run
console.error("Running for: " + hoursRunning + " hours");
console.error("             " + minutesRunning + " minutes");
console.error("             " + secondsRunning + " seconds");
// Start the crawler
crawler.start();
// Stop the crawler after the allotted time
setTimeout(crawler.stop.bind(crawler), 1000 * secondsRunning);
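// Illustrative invocation: page data goes to stdout and progress messages go
// to stderr, so the two streams can be separated, e.g.:
//   node scrape.js > tag-frequencies.txt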