-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawlData.js
More file actions
118 lines (102 loc) · 3.23 KB
/
crawlData.js
File metadata and controls
118 lines (102 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
require('lodash');
const axios = require("axios");
const cheerio = require("cheerio");
const News = require("./models/news");
const { Mongoose } = require('mongoose');
const unidecode = require('unidecode');
// require('./Mongodb')
const root_site = "https://baomoi.com"
/**
 * Builds a responder bound to `res` that turns the outcome of `request`
 * into a JSON reply.
 *
 * `res` must expose `json(body)` and `status(code)` (the latter returning an
 * object with `json`), as an Express response does.
 *
 * @param {object} res - Response object used to send the reply.
 * @returns {(request: Promise<*>|*) => Promise<*>} Function that awaits
 *   `request` and replies with `{ status: "success", data }` on fulfilment or
 *   `{ status: "failure", code, message }` (sent with HTTP status `code`) on
 *   rejection. It resolves with whatever `res.json` returns.
 */
const sendResponse = res => async request => {
  try {
    const data = await request;
    return res.json({ status: "success", data });
  } catch (error) {
    // A rejection value may be null/undefined or carry no `status`; reading it
    // defensively guarantees a reply is still sent instead of throwing here.
    const status = error == null ? undefined : error.status;
    const code = status == null ? 500 : status;
    // Number() keeps numeric strings such as "404" mapping to 'Not found.'.
    const message = Number(code) === 404 ? 'Not found.' : 'Request failed.';
    return res.status(code).json({ status: "failure", code, message });
  }
};
/**
 * Requests `url` with axios and passes the response body to `cheerio.load`
 * (with `decodeEntities: false`).
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} Resolves with the value returned by `cheerio.load`.
 *   On failure the original error is re-thrown after its `status` property is
 *   set to the response status when one is present, otherwise to 500.
 */
const fetchHtmlFromUrl = async url => {
  // Success path: parse the body of the HTTP response.
  const toDocument = ({ data }) => cheerio.load(data, { decodeEntities: false });

  // Failure path: tag the error with a status code and propagate it.
  const tagWithStatus = failure => {
    const { response } = failure;
    failure.status = (response && response.status) || 500;
    throw failure;
  };

  return axios.get(url).then(toDocument).catch(tagWithStatus);
}
// fetchHtmlFromUrl(site)
// .then($ => {
// fruits = []
// $('div.timeline div.story a.cache').each((i, e) => {
// fruits.push(site + $(e).attr('href'));
// });
// return fruits
// })
/**
 * Loads a listing page, collects the article links found in its timeline
 * (stories marked video, photo or wait-render are excluded by the selector)
 * and fetches every linked article through `getContentPage`.
 *
 * @param {string} url - Address of the listing page.
 * @returns {Promise<Array<object>>} One `getContentPage` result per collected
 *   link, in document order. Rejects if the listing or any article fails,
 *   because the article requests are combined with `Promise.all`.
 */
const scrawl = async url => {
  const $ = await fetchHtmlFromUrl(url);
  const sites = [];
  $('div.timeline div.story:not(.story--video,.story--photo,.wait-render) a.cache').each((i, e) => {
    const href = $(e).attr('href');
    // An anchor without href would yield "<root_site>undefined"; that bogus
    // request would fail and reject the whole batch, so skip it.
    if (!href) return;
    sites.push(root_site + href);
  });
  return Promise.all(sites.map(site => getContentPage(site)));
}
/**
 * Fetches one article page and extracts its fields with CSS selectors.
 *
 * All values are block-scoped: nothing is written to the global object, so
 * the function also works in strict mode / ES modules.
 *
 * @param {string} url - Address of the article page.
 * @returns {Promise<object>} Object with `title`, `dated` (a Date built from
 *   the `datetime` attribute; an Invalid Date when that attribute is absent),
 *   `category`, `summary`, `content` (inner HTML of the article body), `auth`,
 *   `site` (the value of `root_site`), `url` (the argument), `source` and
 *   `thumbnail`.
 */
const getContentPage = async url => {
  const $ = await fetchHtmlFromUrl(url);
  const title = $('h1.article__header').text();
  const dated = new Date($('div.article__meta time').attr('datetime'));
  const category = $('div.breadcrumb a.cate').first().text();
  const summary = $('div.article__sapo').text();
  const content = $('div.article__body').html();
  const auth = $('p.body-author').text();
  const site = root_site;
  const source = $('p.bm-source a').attr('href');
  const thumbnail = $('div.article p.body-image img').first().attr('src');
  return {
    title,
    dated,
    category,
    summary,
    content,
    auth,
    site,
    url,
    source,
    thumbnail
  };
}
/**
 * Crawls the first "tin-moi" listing page and stores the extracted articles
 * through `News.insertMany`.
 *
 * The insert is awaited, so the returned promise settles only once the
 * insert has finished. An insert failure is logged and not re-thrown; a
 * crawl failure rejects the returned promise.
 *
 * @returns {Promise<void>}
 */
const run = async () => {
  const data = await scrawl(`${root_site}/tin-moi/trang1.epi`);
  try {
    const res = await News.insertMany(data);
    console.log(res.length);
  } catch (err) {
    console.log(err);
  }
}
// run()
/**
 * Manual smoke test for keyword tagging: compiles each keyword with
 * `StringToRegex` (flags `u` and `i`), tests it against a sample summary,
 * and logs the keywords that matched.
 *
 * @returns {string[]} The matching keywords, in keyword order (also logged).
 */
const runtest = () => {
  // Only the summary is matched, so the fixture carries just that field.
  const sample = { summary: "tang nhon phu a, quan 9 is my life" };
  const keywords = ["Q.9", "Quận 9", "Tăng Nhơn Phú A"];
  const flags = 'ui';
  const tags = keywords.filter(keyword =>
    new RegExp(StringToRegex(keyword), flags).test(sample.summary)
  );
  console.log(tags);
  return tags;
}
/**
 * Converts a keyword into regular-expression source that matches it either
 * as written or with its non-ASCII characters transliterated by `unidecode`
 * (e.g. an accented vowel or its plain ASCII vowel).
 *
 * The keyword is trimmed, lower-cased and NFC-normalised, so callers should
 * compile the result with the `i` flag. Each character is handled on its own:
 *  - ASCII characters are emitted literally, with metacharacters escaped;
 *  - a character that transliterates to one ASCII letter/digit becomes a
 *    two-member character class;
 *  - any other transliteration becomes a non-capturing alternation.
 *
 * NOTE(review): the pattern is still wrapped in `\b`, which uses the ASCII
 * notion of a word character. A keyword whose first or last character is
 * accented may therefore not match its accented spelling at that edge.
 *
 * @param {string} str - Keyword to convert.
 * @returns {string} Regex source; `(?!)` (matches nothing) when the keyword
 *   is empty after trimming.
 */
const StringToRegex = (str) => {
  const escapeLiteral = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const text = str.trim().toLowerCase().normalize('NFC');
  // An empty keyword must not turn into a pattern that matches everywhere.
  if (text === '') return '(?!)';

  // Array.from iterates by code point, so surrogate pairs stay intact.
  const pattern = Array.from(text, (ch) => {
    // Transliterate one character at a time so expansions such as one
    // character becoming two cannot shift later positions.
    const ascii = unidecode(ch);
    if (ascii === ch || ascii === '') return escapeLiteral(ch);
    if (/^[A-Za-z0-9]$/.test(ascii)) return `[${ascii}${ch}]`;
    return `(?:${escapeLiteral(ascii)}|${ch})`;
  }).join('');

  return `\\b${pattern}\\b`;
}
// NOTE(review): this call runs the keyword-matching smoke test (and its
// console output) every time this module is loaded — looks like a leftover
// debug invocation; confirm whether it should stay.
runtest()
// Mongoose.connection.open()
// Public API of this module: only the HTML fetcher and the response helper
// are exported; the crawl functions above remain module-private.
module.exports={
fetchHtmlFromUrl,
sendResponse
}